[2026-01-26 21:45:41,287] [WARNING] [axolotl.utils.trainer.prepare_optim_env:658] [PID:58141] P2P support not detected, setting `NCCL_P2P_DISABLE=1`
[2026-01-26 21:45:41,445] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:58141] baseline 0.000GB ()
[2026-01-26 21:45:41,445] [INFO] [axolotl.cli.config.load_cfg:259] [PID:58141] config:
{
  "accelerator_config": {
    "dispatch_batches": false,
    "split_batches": false
  },
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "./qlora-32b-part2.yaml",
  "base_model": "Guilherme34/secretmodel-indevelopment-full-part1",
  "base_model_config": "Guilherme34/secretmodel-indevelopment-full-part1",
  "batch_size": 2,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_89",
    "fp8": false,
    "n_gpu": 2,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 2,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 54,
  "dataset_prepared_path": "last_run_prepared",
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "offload_params": true,
    "state_dict_type": "FULL_STATE_DICT",
    "sync_module_states": true,
    "transformer_layer_cls_to_wrap": "Glm4MoeLiteDecoderLayer",
    "use_orig_params": false
  },
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "include_tkps": true,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 48,
  "lora_dropout": 0.05,
  "lora_r": 24,
  "lora_target_modules": [
    "gate_proj",
    "down_proj",
    "up_proj",
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "loss_watchdog_patience": 3,
  "loss_watchdog_threshold": 5.0,
  "lr_scheduler": "cosine",
  "max_steps": 5680,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 1,
  "model_config_type": "glm4_moe_lite",
  "num_epochs": 1.0,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/qlora-out",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "pretraining_dataset": [
    {
      "path": "Guilherme34/best-dataset-glm47flash",
      "split": "train",
      "text_column": "text",
      "trust_remote_code": false,
      "type": "pretrain"
    }
  ],
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 1000,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "pad_token": "<|endoftext|>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Guilherme34/secretmodel-indevelopment-full-part1",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 2
}
[2026-01-26 21:45:41,445] [WARNING] [axolotl.cli.checks.check_accelerate_default_config:19] [PID:58141] accelerate config file found at /root/.cache/huggingface/accelerate/default_config.yaml. This can lead to unexpected errors
[2026-01-26 21:45:43,488] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:58141] EOS: 154820 / <|endoftext|>
[2026-01-26 21:45:43,488] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:58141] BOS: None / None
[2026-01-26 21:45:43,488] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:58141] PAD: 154820 / <|endoftext|>
[2026-01-26 21:45:43,489] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:58141] UNK: None / None
[2026-01-26 21:45:45,394] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:58141] loading tokenizer... Guilherme34/secretmodel-indevelopment-full-part1
[2026-01-26 21:45:47,156] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:58141] EOS: 154820 / <|endoftext|>
[2026-01-26 21:45:47,156] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:58141] BOS: None / None
[2026-01-26 21:45:47,156] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:58141] PAD: 154820 / <|endoftext|>
[2026-01-26 21:45:47,156] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:58141] UNK: None / None
[2026-01-26 21:45:47,156] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:58141] Loading model
[2026-01-26 21:45:47,280] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:58141] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-01-26 21:45:47,281] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:58141] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-01-26 21:45:47,281] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:345] [PID:58141] Applying multipack dataloader patch for sample packing...
Loading weights:   0%|                                                                                                                                                                                               | 0/751 [00:00<?, ?it/s]Loading weights:   0%|▏                                                                                                                                               | 1/751 [00:00<00:00, 21959.71it/s, Materializing param=lm_head.weight]Loading weights:   0%|▏                                                                                                                                                | 1/751 [00:00<00:00, 2634.61it/s, Materializing param=lm_head.weight]Loading weights:   0%|▎                                                                                                                                     | 2/751 [00:00<00:00, 3416.95it/s, Materializing param=model.embed_tokens.weight]Loading weights:   0%|▎                                                                                                                                     | 2/751 [00:00<00:00, 2573.19it/s, Materializing param=model.embed_tokens.weight]Loading weights:   0%|▍                                                                                                                         | 3/751 [00:00<00:00, 3040.82it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   0%|▍                                                                                                                         | 3/751 [00:00<00:00, 2718.28it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|▋                                                                                                                           | 4/751 [00:00<00:00, 3009.37it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▋                                                                                                                           | 4/751 [00:00<00:00, 2832.08it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▊                                                                                                                           | 5/751 [00:00<00:00, 3294.30it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|▊                                                                                                                           | 5/751 [00:00<00:00, 3147.93it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|█                                                                                                                             | 6/751 [00:00<00:00, 3331.02it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   1%|█                                                                                                                             | 6/751 [00:00<00:00, 3020.74it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   1%|█                                                                                                                | 7/751 [00:00<00:00, 2995.63it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   1%|█                                                                                                                | 7/751 [00:00<00:00, 2832.08it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   1%|█▏                                                                                                               | 8/751 [00:00<00:00, 3063.77it/s, Materializing param=model.layers.0.self_attn.kv_a_layernorm.weight]Loading weights:   1%|█▏                                                                                                               | 8/751 [00:00<00:00, 2977.06it/s, Materializing param=model.layers.0.self_attn.kv_a_layernorm.weight]Loading weights:   1%|█▎                                                                                                           | 9/751 [00:00<00:00, 3154.67it/s, Materializing param=model.layers.0.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   1%|█▎                                                                                                           | 9/751 [00:00<00:00, 2867.57it/s, Materializing param=model.layers.0.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   1%|█▌                                                                                                                   | 10/751 [00:00<00:00, 3013.15it/s, Materializing param=model.layers.0.self_attn.kv_b_proj.weight]Loading weights:   1%|█▌                                                                                                                   | 10/751 [00:00<00:00, 2865.74it/s, Materializing param=model.layers.0.self_attn.kv_b_proj.weight]Loading weights:   1%|█▊                                                                                                                      | 11/751 [00:00<00:00, 2887.01it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   1%|█▊                                                                                                                      | 11/751 [00:00<00:00, 2769.02it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   2%|█▊                                                                                                               | 12/751 [00:00<00:00, 2800.56it/s, Materializing param=model.layers.0.self_attn.q_a_layernorm.weight]Loading weights:   2%|█▊                                                                                                               | 12/751 [00:00<00:00, 2699.62it/s, Materializing param=model.layers.0.self_attn.q_a_layernorm.weight]Loading weights:   2%|██                                                                                                                    | 13/751 [00:00<00:00, 2817.30it/s, Materializing param=model.layers.0.self_attn.q_a_proj.weight]Loading weights:   2%|██                                                                                                                    | 13/751 [00:00<00:00, 2692.51it/s, Materializing param=model.layers.0.self_attn.q_a_proj.weight]Loading weights:   2%|██▏                                                                                                                   | 14/751 [00:00<00:00, 2808.37it/s, Materializing param=model.layers.0.self_attn.q_b_proj.weight]Loading weights:   2%|██▏                                                                                                                   | 14/751 [00:00<00:00, 2645.29it/s, Materializing param=model.layers.0.self_attn.q_b_proj.weight]Loading weights:   2%|██▍                                                                                                                      | 15/751 [00:00<00:00, 2566.79it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   2%|██▍                                                                                                                      | 15/751 [00:00<00:00, 2355.73it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   2%|██▌                                                                                                                       | 16/751 [00:00<00:00, 2381.86it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   2%|██▌                                                                                                                       | 16/751 [00:00<00:00, 2314.66it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   0%|                                                                                                                                                                                               | 0/751 [00:00<?, ?it/s]Loading weights:   0%|▏                                                                                                                                               | 1/751 [00:00<00:00, 21509.25it/s, Materializing param=lm_head.weight]Loading weights:   0%|▏                                                                                                                                                | 1/751 [00:00<00:00, 3421.13it/s, Materializing param=lm_head.weight]Loading weights:   0%|▎                                                                                                                                     | 2/751 [00:00<00:00, 3442.19it/s, Materializing param=model.embed_tokens.weight]Loading weights:   0%|▎                                                                                                                                     | 2/751 [00:00<00:00, 2909.68it/s, Materializing param=model.embed_tokens.weight]Loading weights:   0%|▍                                                                                                                         | 3/751 [00:00<00:00, 3405.39it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   0%|▍                                                                                                                         | 3/751 [00:00<00:00, 2957.21it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|▋                                                                                                                           | 4/751 [00:00<00:00, 2975.21it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▋                                                                                                                           | 4/751 [00:00<00:00, 2587.48it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▊                                                                                                                           | 5/751 [00:00<00:00, 2850.55it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|▊                                                                                                                           | 5/751 [00:00<00:00, 2659.00it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|█                                                                                                                             | 6/751 [00:00<00:00, 2806.49it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   1%|█                                                                                                                             | 6/751 [00:00<00:00, 2566.37it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   1%|█                                                                                                                | 7/751 [00:00<00:00, 2832.90it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   1%|█                                                                                                                | 7/751 [00:00<00:00, 2691.61it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   1%|█▏                                                                                                               | 8/751 [00:00<00:00, 2898.62it/s, Materializing param=model.layers.0.self_attn.kv_a_layernorm.weight]Loading weights:   1%|█▏                                                                                                               | 8/751 [00:00<00:00, 2744.29it/s, Materializing param=model.layers.0.self_attn.kv_a_layernorm.weight]Loading weights:   1%|█▎                                                                                                           | 9/751 [00:00<00:00, 2846.38it/s, Materializing param=model.layers.0.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   1%|█▎                                                                                                           | 9/751 [00:00<00:00, 2646.06it/s, Materializing param=model.layers.0.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   1%|█▌                                                                                                                   | 10/751 [00:00<00:00, 2689.69it/s, Materializing param=model.layers.0.self_attn.kv_b_proj.weight]Loading weights:   1%|█▌                                                                                                                   | 10/751 [00:00<00:00, 2553.77it/s, Materializing param=model.layers.0.self_attn.kv_b_proj.weight]Loading weights:   1%|█▊                                                                                                                      | 11/751 [00:00<00:00, 2599.14it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   1%|█▊                                                                                                                      | 11/751 [00:00<00:00, 2552.97it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   2%|█▊                                                                                                               | 12/751 [00:00<00:00, 2709.35it/s, Materializing param=model.layers.0.self_attn.q_a_layernorm.weight]Loading weights:   2%|█▊                                                                                                               | 12/751 [00:00<00:00, 2659.39it/s, Materializing param=model.layers.0.self_attn.q_a_layernorm.weight]Loading weights:   2%|██                                                                                                                    | 13/751 [00:00<00:00, 2758.85it/s, Materializing param=model.layers.0.self_attn.q_a_proj.weight]Loading weights:   2%|██                                                                                                                    | 13/751 [00:00<00:00, 2710.17it/s, Materializing param=model.layers.0.self_attn.q_a_proj.weight]Loading weights:   2%|██▏                                                                                                                   | 14/751 [00:00<00:00, 2774.27it/s, Materializing param=model.layers.0.self_attn.q_b_proj.weight]Loading weights:   2%|██▏                                                                                                                   | 14/751 [00:00<00:00, 2666.19it/s, Materializing param=model.layers.0.self_attn.q_b_proj.weight]Loading weights:   2%|██▍                                                                                                                      | 15/751 [00:00<00:00, 2794.71it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   2%|██▍                                                                                                                      | 15/751 [00:00<00:00, 2730.43it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   2%|██▌                                                                                                                       | 16/751 [00:00<00:00, 2840.83it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   2%|██▌                                                                                                                       | 16/751 [00:00<00:00, 2788.77it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   2%|██▊                                                                                                                         | 17/751 [00:00<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   2%|██▊                                                                                                                         | 17/751 [00:00<00:37, 19.67it/s, Materializing param=model.layers.1.mlp.experts.down_proj]Loading weights:   2%|██▋                                                                                                                      | 17/751 [00:00<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.experts.gate_up_proj]Loading weights:   2%|██▋                                                                                                                      | 17/751 [00:00<00:37, 19.67it/s, Materializing param=model.layers.1.mlp.experts.gate_up_proj]Loading weights:   2%|██▋                                                                                                                      | 17/751 [00:00<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.experts.gate_up_proj]Loading weights:   2%|██▋                                                                                                                      | 17/751 [00:00<00:37, 19.67it/s, Materializing param=model.layers.1.mlp.experts.gate_up_proj]Loading weights:   2%|██▋                                                                                                              | 18/751 [00:03<00:37, 19.67it/s, Materializing param=model.layers.1.mlp.gate.e_score_correction_bias]Loading weights:   2%|██▋                                                                                                              | 18/751 [00:03<00:37, 19.67it/s, Materializing param=model.layers.1.mlp.gate.e_score_correction_bias]Loading weights:   3%|██▊                                                                                                              | 19/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.gate.e_score_correction_bias]Loading weights:   3%|███▎                                                                                                                              | 19/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.gate.weight]Loading weights:   3%|███▎                                                                                                                              | 19/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.gate.weight]Loading weights:   3%|██▉                                                                                                           | 20/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.down_proj.weight]Loading weights:   3%|██▉                                                                                                           | 20/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.down_proj.weight]Loading weights:   3%|███                                                                                                           | 21/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.gate_proj.weight]Loading weights:   3%|███                                                                                                           | 21/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.gate_proj.weight]Loading weights:   3%|███▎                                                                                                            | 22/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.up_proj.weight]Loading weights:   3%|███▎                                                                                                            | 22/751 [00:03<02:26,  4.99it/s, Materializing param=model.layers.1.mlp.shared_experts.up_proj.weight]Loading weights:   3%|███▍                                                                                                              | 23/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   3%|███▍                                                                                                              | 23/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   3%|███▋                                                                                                              | 24/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_a_layernorm.weight]Loading weights:   3%|███▋                                                                                                              | 24/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_a_layernorm.weight]Loading weights:   3%|███▋                                                                                                          | 25/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   3%|███▋                                                                                                          | 25/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   3%|████                                                                                                                   | 26/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_b_proj.weight]Loading weights:   3%|████                                                                                                                   | 26/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.kv_b_proj.weight]Loading weights:   4%|████▍                                                                                                                     | 27/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   4%|████▍                                                                                                                     | 27/751 [00:03<02:25,  4.99it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   4%|████▎                                                                                                              | 28/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_a_layernorm.weight]Loading weights:   4%|████▎                                                                                                              | 28/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_a_layernorm.weight]Loading weights:   4%|████▋                                                                                                                   | 29/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_a_proj.weight]Loading weights:   4%|████▋                                                                                                                   | 29/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_a_proj.weight]Loading weights:   4%|████▊                                                                                                                   | 30/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_b_proj.weight]Loading weights:   4%|████▊                                                                                                                   | 30/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.1.self_attn.q_b_proj.weight]Loading weights:   4%|█████                                                                                                                      | 31/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   4%|█████                                                                                                                      | 31/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   4%|█████▎                                                                                                                      | 32/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   4%|█████▎                                                                                                                      | 32/751 [00:03<02:24,  4.99it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   2%|██▋                                                                                                              | 18/751 [00:02<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.gate.e_score_correction_bias]Loading weights:   2%|██▋                                                                                                              | 18/751 [00:02<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.gate.e_score_correction_bias]Loading weights:   3%|███▎                                                                                                                              | 19/751 [00:02<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.gate.weight]Loading weights:   3%|███▎                                                                                                                              | 19/751 [00:02<00:24, 29.65it/s, Materializing param=model.layers.1.mlp.gate.weight]Loading weights:   3%|███▍                                                                                                                              | 20/751 [00:02<02:07,  5.75it/s, Materializing param=model.layers.1.mlp.gate.weight]Loading weights:   3%|██▉                                                                                                           | 20/751 [00:02<02:07,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.down_proj.weight]Loading weights:   3%|██▉                                                                                                           | 20/751 [00:02<02:07,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.down_proj.weight]Loading weights:   3%|███                                                                                                           | 21/751 [00:02<02:07,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.gate_proj.weight]Loading weights:   3%|███                                                                                                           | 21/751 [00:02<02:07,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.gate_proj.weight]Loading weights:   3%|███▎                                                                                                            | 22/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.up_proj.weight]Loading weights:   3%|███▎                                                                                                            | 22/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.mlp.shared_experts.up_proj.weight]Loading weights:   3%|███▍                                                                                                              | 23/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   3%|███▍                                                                                                              | 23/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   3%|███▋                                                                                                              | 24/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_a_layernorm.weight]Loading weights:   3%|███▋                                                                                                              | 24/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_a_layernorm.weight]Loading weights:   3%|███▋                                                                                                          | 25/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   3%|███▋                                                                                                          | 25/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   3%|████                                                                                                                   | 26/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_b_proj.weight]Loading weights:   3%|████                                                                                                                   | 26/751 [00:02<02:06,  5.75it/s, Materializing param=model.layers.1.self_attn.kv_b_proj.weight]Loading weights:   4%|████▍                                                                                                                     | 27/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   4%|████▍                                                                                                                     | 27/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   4%|████▎                                                                                                              | 28/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_a_layernorm.weight]Loading weights:   4%|████▎                                                                                                              | 28/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_a_layernorm.weight]Loading weights:   4%|████▋                                                                                                                   | 29/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_a_proj.weight]Loading weights:   4%|████▋                                                                                                                   | 29/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_a_proj.weight]Loading weights:   4%|████▊                                                                                                                   | 30/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_b_proj.weight]Loading weights:   4%|████▊                                                                                                                   | 30/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.1.self_attn.q_b_proj.weight]Loading weights:   4%|█████                                                                                                                      | 31/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   4%|█████                                                                                                                      | 31/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   4%|█████▎                                                                                                                      | 32/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   4%|█████▎                                                                                                                      | 32/751 [00:02<02:05,  5.75it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   4%|█████▍                                                                                                                      | 33/751 [00:03<01:27,  8.19it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   4%|█████▍                                                                                                                      | 33/751 [00:03<01:24,  8.52it/s, Materializing param=model.layers.2.mlp.experts.down_proj]Loading weights:   4%|█████▎                                                                                                                   | 33/751 [00:03<01:27,  8.19it/s, Materializing param=model.layers.2.mlp.experts.gate_up_proj]Loading weights:   4%|█████▎                                                                                                                   | 33/751 [00:03<01:24,  8.52it/s, Materializing param=model.layers.2.mlp.experts.gate_up_proj]Loading weights:   4%|█████▎                                                                                                                   | 33/751 [00:03<01:27,  8.19it/s, Materializing param=model.layers.2.mlp.experts.gate_up_proj]Loading weights:   4%|█████▎                                                                                                                   | 33/751 [00:03<01:24,  8.52it/s, Materializing param=model.layers.2.mlp.experts.gate_up_proj]Loading weights:   5%|█████                                                                                                            | 34/751 [00:05<01:24,  8.52it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|█████                                                                                                            | 34/751 [00:05<01:24,  8.52it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|█████▎                                                                                                           | 35/751 [00:05<02:38,  4.53it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|██████                                                                                                                            | 35/751 [00:05<02:38,  4.53it/s, Materializing param=model.layers.2.mlp.gate.weight]Loading weights:   5%|██████                                                                                                                            | 35/751 [00:05<02:38,  4.53it/s, Materializing param=model.layers.2.mlp.gate.weight]Loading weights:   5%|█████▎                                                                                                        | 36/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.down_proj.weight]Loading weights:   5%|█████▎                                                                                                        | 36/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.down_proj.weight]Loading weights:   5%|█████▍                                                                                                        | 37/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.gate_proj.weight]Loading weights:   5%|█████▍                                                                                                        | 37/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.gate_proj.weight]Loading weights:   5%|█████▋                                                                                                          | 38/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.up_proj.weight]Loading weights:   5%|█████▋                                                                                                          | 38/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.mlp.shared_experts.up_proj.weight]Loading weights:   5%|█████▉                                                                                                            | 39/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   5%|█████▉                                                                                                            | 39/751 [00:05<02:37,  4.53it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   5%|██████                                                                                                            | 40/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_a_layernorm.weight]Loading weights:   5%|██████                                                                                                            | 40/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_a_layernorm.weight]Loading weights:   5%|██████                                                                                                        | 41/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   5%|██████                                                                                                        | 41/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   6%|██████▋                                                                                                                | 42/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_b_proj.weight]Loading weights:   6%|██████▋                                                                                                                | 42/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.kv_b_proj.weight]Loading weights:   6%|██████▉                                                                                                                   | 43/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:   6%|██████▉                                                                                                                   | 43/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:   6%|██████▋                                                                                                            | 44/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.q_a_layernorm.weight]Loading weights:   6%|██████▋                                                                                                            | 44/751 [00:05<02:36,  4.53it/s, Materializing param=model.layers.2.self_attn.q_a_layernorm.weight]Loading weights:   6%|███████▏                                                                                                                | 45/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.2.self_attn.q_a_proj.weight]Loading weights:   6%|███████▏                                                                                                                | 45/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.2.self_attn.q_a_proj.weight]Loading weights:   6%|███████▎                                                                                                                | 46/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.2.self_attn.q_b_proj.weight]Loading weights:   6%|███████▎                                                                                                                | 46/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.2.self_attn.q_b_proj.weight]Loading weights:   6%|███████▋                                                                                                                   | 47/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:   6%|███████▋                                                                                                                   | 47/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:   6%|███████▉                                                                                                                    | 48/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   6%|███████▉                                                                                                                    | 48/751 [00:05<02:35,  4.53it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   5%|█████                                                                                                            | 34/751 [00:06<01:27,  8.19it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|█████                                                                                                            | 34/751 [00:06<01:27,  8.19it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|█████▎                                                                                                           | 35/751 [00:06<02:40,  4.47it/s, Materializing param=model.layers.2.mlp.gate.e_score_correction_bias]Loading weights:   5%|██████                                                                                                                            | 35/751 [00:06<02:40,  4.47it/s, Materializing param=model.layers.2.mlp.gate.weight]Loading weights:   5%|██████                                                                                                                            | 35/751 [00:06<02:40,  4.47it/s, Materializing param=model.layers.2.mlp.gate.weight]Loading weights:   5%|█████▎                                                                                                        | 36/751 [00:06<02:40,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.down_proj.weight]Loading weights:   5%|█████▎                                                                                                        | 36/751 [00:06<02:40,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.down_proj.weight]Loading weights:   5%|█████▍                                                                                                        | 37/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.gate_proj.weight]Loading weights:   5%|█████▍                                                                                                        | 37/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.gate_proj.weight]Loading weights:   5%|█████▋                                                                                                          | 38/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.up_proj.weight]Loading weights:   5%|█████▋                                                                                                          | 38/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.mlp.shared_experts.up_proj.weight]Loading weights:   5%|█████▉                                                                                                            | 39/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   5%|█████▉                                                                                                            | 39/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   5%|██████                                                                                                            | 40/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_a_layernorm.weight]Loading weights:   5%|██████                                                                                                            | 40/751 [00:06<02:39,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_a_layernorm.weight]Loading weights:   5%|██████                                                                                                        | 41/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   5%|██████                                                                                                        | 41/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   6%|██████▋                                                                                                                | 42/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_b_proj.weight]Loading weights:   6%|██████▋                                                                                                                | 42/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.kv_b_proj.weight]Loading weights:   6%|██████▉                                                                                                                   | 43/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:   6%|██████▉                                                                                                                   | 43/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:   6%|██████▋                                                                                                            | 44/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.q_a_layernorm.weight]Loading weights:   6%|██████▋                                                                                                            | 44/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.q_a_layernorm.weight]Loading weights:   6%|███████▏                                                                                                                | 45/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.q_a_proj.weight]Loading weights:   6%|███████▏                                                                                                                | 45/751 [00:06<02:38,  4.47it/s, Materializing param=model.layers.2.self_attn.q_a_proj.weight]Loading weights:   6%|███████▎                                                                                                                | 46/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.2.self_attn.q_b_proj.weight]Loading weights:   6%|███████▎                                                                                                                | 46/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.2.self_attn.q_b_proj.weight]Loading weights:   6%|███████▋                                                                                                                   | 47/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:   6%|███████▋                                                                                                                   | 47/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:   6%|███████▉                                                                                                                    | 48/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   6%|███████▉                                                                                                                    | 48/751 [00:06<02:37,  4.47it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   7%|████████                                                                                                                    | 49/751 [00:06<01:39,  7.08it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   7%|████████                                                                                                                    | 49/751 [00:07<01:40,  6.99it/s, Materializing param=model.layers.3.mlp.experts.down_proj]Loading weights:   7%|███████▉                                                                                                                 | 49/751 [00:06<01:39,  7.08it/s, Materializing param=model.layers.3.mlp.experts.gate_up_proj]Loading weights:   7%|███████▉                                                                                                                 | 49/751 [00:07<01:40,  6.99it/s, Materializing param=model.layers.3.mlp.experts.gate_up_proj]Loading weights:   7%|███████▉                                                                                                                 | 49/751 [00:07<01:40,  6.99it/s, Materializing param=model.layers.3.mlp.experts.gate_up_proj]Loading weights:   7%|███████▉                                                                                                                 | 49/751 [00:06<01:39,  7.08it/s, Materializing param=model.layers.3.mlp.experts.gate_up_proj]Loading weights:   7%|████████                                                                                                                 | 50/751 [00:09<02:51,  4.10it/s, Materializing param=model.layers.3.mlp.experts.gate_up_proj]Loading weights:   7%|███████▌                                                                                                         | 50/751 [00:08<01:39,  7.08it/s, Materializing param=model.layers.3.mlp.gate.e_score_correction_bias]Loading weights:   7%|███████▌                                                                                                         | 50/751 [00:08<01:39,  7.08it/s, Materializing param=model.layers.3.mlp.gate.e_score_correction_bias]Loading weights:   7%|███████▌                                                                                                         | 50/751 [00:09<02:51,  4.10it/s, Materializing param=model.layers.3.mlp.gate.e_score_correction_bias]Loading weights:   7%|███████▋                                                                                                         | 51/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.gate.e_score_correction_bias]Loading weights:   7%|███████▌                                                                                                         | 50/751 [00:09<02:51,  4.10it/s, Materializing param=model.layers.3.mlp.gate.e_score_correction_bias]Loading weights:   7%|████████▊                                                                                                                         | 51/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.gate.weight]Loading weights:   7%|████████▊                                                                                                                         | 51/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.gate.weight]Loading weights:   7%|████████▊                                                                                                                         | 51/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.gate.weight]Loading weights:   7%|████████▊                                                                                                                         | 51/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.gate.weight]Loading weights:   7%|███████▌                                                                                                      | 52/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.down_proj.weight]Loading weights:   7%|███████▌                                                                                                      | 52/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.down_proj.weight]Loading weights:   7%|███████▌                                                                                                      | 52/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.down_proj.weight]Loading weights:   7%|███████▊                                                                                                      | 53/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.gate_proj.weight]Loading weights:   7%|███████▌                                                                                                      | 52/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.down_proj.weight]Loading weights:   7%|███████▊                                                                                                      | 53/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.gate_proj.weight]Loading weights:   7%|███████▊                                                                                                      | 53/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.gate_proj.weight]Loading weights:   7%|████████                                                                                                        | 54/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.up_proj.weight]Loading weights:   7%|███████▊                                                                                                      | 53/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.gate_proj.weight]Loading weights:   7%|████████                                                                                                        | 54/751 [00:08<02:41,  4.33it/s, Materializing param=model.layers.3.mlp.shared_experts.up_proj.weight]Loading weights:   7%|████████                                                                                                        | 54/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.up_proj.weight]Loading weights:   7%|████████▎                                                                                                         | 55/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:   7%|████████                                                                                                        | 54/751 [00:09<02:50,  4.10it/s, Materializing param=model.layers.3.mlp.shared_experts.up_proj.weight]Loading weights:   7%|████████▎                                                                                                         | 55/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:   7%|████████▌                                                                                                         | 56/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_a_layernorm.weight]Loading weights:   7%|████████▎                                                                                                         | 55/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:   7%|████████▌                                                                                                         | 56/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_a_layernorm.weight]Loading weights:   7%|████████▎                                                                                                         | 55/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:   8%|████████▎                                                                                                     | 57/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   8%|████████▎                                                                                                     | 57/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   7%|████████▌                                                                                                         | 56/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_a_layernorm.weight]Loading weights:   8%|█████████▏                                                                                                             | 58/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_b_proj.weight]Loading weights:   8%|█████████▏                                                                                                             | 58/751 [00:08<02:40,  4.33it/s, Materializing param=model.layers.3.self_attn.kv_b_proj.weight]Loading weights:   7%|████████▌                                                                                                         | 56/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_a_layernorm.weight]Loading weights:   8%|█████████▌                                                                                                                | 59/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:   8%|█████████▌                                                                                                                | 59/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:   8%|█████████▏                                                                                                         | 60/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_a_layernorm.weight]Loading weights:   8%|████████▎                                                                                                     | 57/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   8%|█████████▏                                                                                                         | 60/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_a_layernorm.weight]Loading weights:   8%|████████▎                                                                                                     | 57/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_a_proj_with_mqa.weight]Loading weights:   8%|█████████▋                                                                                                              | 61/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_a_proj.weight]Loading weights:   8%|█████████▏                                                                                                             | 58/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_b_proj.weight]Loading weights:   8%|█████████▋                                                                                                              | 61/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_a_proj.weight]Loading weights:   8%|█████████▏                                                                                                             | 58/751 [00:09<02:49,  4.10it/s, Materializing param=model.layers.3.self_attn.kv_b_proj.weight]Loading weights:   8%|█████████▉                                                                                                              | 62/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_b_proj.weight]Loading weights:   8%|█████████▌                                                                                                                | 59/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:   8%|█████████▉                                                                                                              | 62/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.3.self_attn.q_b_proj.weight]Loading weights:   8%|█████████▌                                                                                                                | 59/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:   8%|██████████▎                                                                                                                | 63/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:   8%|█████████▏                                                                                                         | 60/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_a_layernorm.weight]Loading weights:   8%|██████████▎                                                                                                                | 63/751 [00:08<02:39,  4.33it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:   8%|█████████▏                                                                                                         | 60/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_a_layernorm.weight]Loading weights:   9%|██████████▌                                                                                                                 | 64/751 [00:08<02:38,  4.33it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   9%|██████████▌                                                                                                                 | 64/751 [00:08<02:38,  4.33it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   8%|█████████▋                                                                                                              | 61/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_a_proj.weight]Loading weights:   8%|█████████▋                                                                                                              | 61/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_a_proj.weight]Loading weights:   8%|█████████▉                                                                                                              | 62/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_b_proj.weight]Loading weights:   8%|█████████▉                                                                                                              | 62/751 [00:09<02:48,  4.10it/s, Materializing param=model.layers.3.self_attn.q_b_proj.weight]Loading weights:   8%|██████████▎                                                                                                                | 63/751 [00:09<02:47,  4.10it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:   8%|██████████▎                                                                                                                | 63/751 [00:09<02:47,  4.10it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:   9%|██████████▌                                                                                                                 | 64/751 [00:09<02:47,  4.10it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   9%|██████████▌                                                                                                                 | 64/751 [00:09<02:47,  4.10it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   9%|██████████▋                                                                                                                 | 65/751 [00:09<01:44,  6.58it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   9%|██████████▋                                                                                                                 | 65/751 [00:10<01:43,  6.61it/s, Materializing param=model.layers.4.mlp.experts.down_proj]Loading weights:   9%|██████████▍                                                                                                              | 65/751 [00:09<01:44,  6.58it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|██████████▍                                                                                                              | 65/751 [00:10<01:43,  6.61it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|██████████▍                                                                                                              | 65/751 [00:09<01:44,  6.58it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|██████████▍                                                                                                              | 65/751 [00:10<01:43,  6.61it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|██████████▋                                                                                                              | 66/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|█████████▉                                                                                                       | 66/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.gate.e_score_correction_bias]Loading weights:   9%|█████████▉                                                                                                       | 66/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.gate.e_score_correction_bias]Loading weights:   9%|███████████▌                                                                                                                      | 67/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.gate.weight]Loading weights:   9%|███████████▌                                                                                                                      | 67/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.gate.weight]Loading weights:   9%|█████████▉                                                                                                    | 68/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.down_proj.weight]Loading weights:   9%|█████████▉                                                                                                    | 68/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.down_proj.weight]Loading weights:   9%|██████████                                                                                                    | 69/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.gate_proj.weight]Loading weights:   9%|██████████                                                                                                    | 69/751 [00:12<02:48,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.gate_proj.weight]Loading weights:   9%|██████████▍                                                                                                     | 70/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.up_proj.weight]Loading weights:   9%|██████████▍                                                                                                     | 70/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.mlp.shared_experts.up_proj.weight]Loading weights:   9%|██████████▊                                                                                                       | 71/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:   9%|██████████▊                                                                                                       | 71/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  10%|██████████▉                                                                                                       | 72/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_a_layernorm.weight]Loading weights:  10%|██████████▉                                                                                                       | 72/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_a_layernorm.weight]Loading weights:  10%|██████████▋                                                                                                   | 73/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  10%|██████████▋                                                                                                   | 73/751 [00:12<02:47,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  10%|███████████▋                                                                                                           | 74/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_b_proj.weight]Loading weights:  10%|███████████▋                                                                                                           | 74/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.kv_b_proj.weight]Loading weights:  10%|████████████▏                                                                                                             | 75/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  10%|████████████▏                                                                                                             | 75/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  10%|███████████▋                                                                                                       | 76/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.q_a_layernorm.weight]Loading weights:  10%|███████████▋                                                                                                       | 76/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.q_a_layernorm.weight]Loading weights:  10%|████████████▎                                                                                                           | 77/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.q_a_proj.weight]Loading weights:  10%|████████████▎                                                                                                           | 77/751 [00:12<02:46,  4.06it/s, Materializing param=model.layers.4.self_attn.q_a_proj.weight]Loading weights:  10%|████████████▍                                                                                                           | 78/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.4.self_attn.q_b_proj.weight]Loading weights:  10%|████████████▍                                                                                                           | 78/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.4.self_attn.q_b_proj.weight]Loading weights:  11%|████████████▉                                                                                                              | 79/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  11%|████████████▉                                                                                                              | 79/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  11%|█████████████▏                                                                                                              | 80/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:  11%|█████████████▏                                                                                                              | 80/751 [00:12<02:45,  4.06it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:   9%|██████████▋                                                                                                              | 66/751 [00:12<02:50,  4.02it/s, Materializing param=model.layers.4.mlp.experts.gate_up_proj]Loading weights:   9%|█████████▉                                                                                                       | 66/751 [00:12<02:50,  4.02it/s, Materializing param=model.layers.4.mlp.gate.e_score_correction_bias]Loading weights:   9%|█████████▉                                                                                                       | 66/751 [00:12<02:50,  4.02it/s, Materializing param=model.layers.4.mlp.gate.e_score_correction_bias]Loading weights:   9%|███████████▌                                                                                                                      | 67/751 [00:12<02:50,  4.02it/s, Materializing param=model.layers.4.mlp.gate.weight]Loading weights:   9%|███████████▌                                                                                                                      | 67/751 [00:12<02:50,  4.02it/s, Materializing param=model.layers.4.mlp.gate.weight]Loading weights:   9%|█████████▉                                                                                                    | 68/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.down_proj.weight]Loading weights:   9%|█████████▉                                                                                                    | 68/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.down_proj.weight]Loading weights:   9%|██████████                                                                                                    | 69/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.gate_proj.weight]Loading weights:   9%|██████████                                                                                                    | 69/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.gate_proj.weight]Loading weights:   9%|██████████▍                                                                                                     | 70/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.up_proj.weight]Loading weights:   9%|██████████▍                                                                                                     | 70/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.mlp.shared_experts.up_proj.weight]Loading weights:   9%|██████████▊                                                                                                       | 71/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:   9%|██████████▊                                                                                                       | 71/751 [00:12<02:49,  4.02it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  10%|██████████▉                                                                                                       | 72/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_a_layernorm.weight]Loading weights:  10%|██████████▉                                                                                                       | 72/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_a_layernorm.weight]Loading weights:  10%|██████████▋                                                                                                   | 73/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  10%|██████████▋                                                                                                   | 73/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  10%|███████████▋                                                                                                           | 74/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_b_proj.weight]Loading weights:  10%|███████████▋                                                                                                           | 74/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.kv_b_proj.weight]Loading weights:  10%|████████████▏                                                                                                             | 75/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  10%|████████████▏                                                                                                             | 75/751 [00:12<02:48,  4.02it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  10%|███████████▋                                                                                                       | 76/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_a_layernorm.weight]Loading weights:  10%|███████████▋                                                                                                       | 76/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_a_layernorm.weight]Loading weights:  10%|████████████▎                                                                                                           | 77/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_a_proj.weight]Loading weights:  10%|████████████▎                                                                                                           | 77/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_a_proj.weight]Loading weights:  10%|████████████▍                                                                                                           | 78/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_b_proj.weight]Loading weights:  10%|████████████▍                                                                                                           | 78/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.4.self_attn.q_b_proj.weight]Loading weights:  11%|████████████▉                                                                                                              | 79/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  11%|████████████▉                                                                                                              | 79/751 [00:12<02:47,  4.02it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  11%|█████████████▏                                                                                                              | 80/751 [00:12<02:46,  4.02it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:  11%|█████████████▏                                                                                                              | 80/751 [00:12<02:46,  4.02it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:  11%|█████████████▎                                                                                                              | 81/751 [00:13<01:44,  6.42it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:  11%|█████████████▎                                                                                                              | 81/751 [00:13<01:44,  6.41it/s, Materializing param=model.layers.5.mlp.experts.down_proj]Loading weights:  11%|█████████████                                                                                                            | 81/751 [00:13<01:44,  6.42it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|█████████████                                                                                                            | 81/751 [00:13<01:44,  6.41it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|█████████████                                                                                                            | 81/751 [00:13<01:44,  6.42it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|█████████████                                                                                                            | 81/751 [00:13<01:44,  6.41it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|█████████████▏                                                                                                           | 82/751 [00:15<02:45,  4.05it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|████████████▎                                                                                                    | 82/751 [00:15<02:45,  4.05it/s, Materializing param=model.layers.5.mlp.gate.e_score_correction_bias]Loading weights:  11%|████████████▎                                                                                                    | 82/751 [00:15<02:45,  4.05it/s, Materializing param=model.layers.5.mlp.gate.e_score_correction_bias]Loading weights:  11%|██████████████▎                                                                                                                   | 83/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.gate.weight]Loading weights:  11%|██████████████▎                                                                                                                   | 83/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.gate.weight]Loading weights:  11%|████████████▎                                                                                                 | 84/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.down_proj.weight]Loading weights:  11%|████████████▎                                                                                                 | 84/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.down_proj.weight]Loading weights:  11%|████████████▍                                                                                                 | 85/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.gate_proj.weight]Loading weights:  11%|████████████▍                                                                                                 | 85/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.gate_proj.weight]Loading weights:  11%|████████████▊                                                                                                   | 86/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.up_proj.weight]Loading weights:  11%|████████████▊                                                                                                   | 86/751 [00:15<02:44,  4.05it/s, Materializing param=model.layers.5.mlp.shared_experts.up_proj.weight]Loading weights:  12%|█████████████▏                                                                                                    | 87/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  12%|█████████████▏                                                                                                    | 87/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  12%|█████████████▎                                                                                                    | 88/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_a_layernorm.weight]Loading weights:  12%|█████████████▎                                                                                                    | 88/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_a_layernorm.weight]Loading weights:  12%|█████████████                                                                                                 | 89/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  12%|█████████████                                                                                                 | 89/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  12%|██████████████▎                                                                                                        | 90/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_b_proj.weight]Loading weights:  12%|██████████████▎                                                                                                        | 90/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.kv_b_proj.weight]Loading weights:  12%|██████████████▊                                                                                                           | 91/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  12%|██████████████▊                                                                                                           | 91/751 [00:15<02:43,  4.05it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  12%|██████████████                                                                                                     | 92/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_a_layernorm.weight]Loading weights:  12%|██████████████                                                                                                     | 92/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_a_layernorm.weight]Loading weights:  12%|██████████████▊                                                                                                         | 93/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_a_proj.weight]Loading weights:  12%|██████████████▊                                                                                                         | 93/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_a_proj.weight]Loading weights:  13%|███████████████                                                                                                         | 94/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_b_proj.weight]Loading weights:  13%|███████████████                                                                                                         | 94/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.5.self_attn.q_b_proj.weight]Loading weights:  13%|███████████████▌                                                                                                           | 95/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  13%|███████████████▌                                                                                                           | 95/751 [00:15<02:42,  4.05it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  13%|███████████████▊                                                                                                            | 96/751 [00:15<02:41,  4.05it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  13%|███████████████▊                                                                                                            | 96/751 [00:15<02:41,  4.05it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  11%|█████████████▏                                                                                                           | 82/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.experts.gate_up_proj]Loading weights:  11%|████████████▎                                                                                                    | 82/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.gate.e_score_correction_bias]Loading weights:  11%|████████████▎                                                                                                    | 82/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.gate.e_score_correction_bias]Loading weights:  11%|██████████████▎                                                                                                                   | 83/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.gate.weight]Loading weights:  11%|██████████████▎                                                                                                                   | 83/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.gate.weight]Loading weights:  11%|████████████▎                                                                                                 | 84/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.down_proj.weight]Loading weights:  11%|████████████▎                                                                                                 | 84/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.down_proj.weight]Loading weights:  11%|████████████▍                                                                                                 | 85/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.gate_proj.weight]Loading weights:  11%|████████████▍                                                                                                 | 85/751 [00:15<02:44,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.gate_proj.weight]Loading weights:  11%|████████████▊                                                                                                   | 86/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.up_proj.weight]Loading weights:  11%|████████████▊                                                                                                   | 86/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.mlp.shared_experts.up_proj.weight]Loading weights:  12%|█████████████▏                                                                                                    | 87/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  12%|█████████████▏                                                                                                    | 87/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  12%|█████████████▎                                                                                                    | 88/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_a_layernorm.weight]Loading weights:  12%|█████████████▎                                                                                                    | 88/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_a_layernorm.weight]Loading weights:  12%|█████████████                                                                                                 | 89/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  12%|█████████████                                                                                                 | 89/751 [00:15<02:43,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  12%|██████████████▎                                                                                                        | 90/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_b_proj.weight]Loading weights:  12%|██████████████▎                                                                                                        | 90/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.kv_b_proj.weight]Loading weights:  12%|██████████████▊                                                                                                           | 91/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  12%|██████████████▊                                                                                                           | 91/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  12%|██████████████                                                                                                     | 92/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.q_a_layernorm.weight]Loading weights:  12%|██████████████                                                                                                     | 92/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.q_a_layernorm.weight]Loading weights:  12%|██████████████▊                                                                                                         | 93/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.q_a_proj.weight]Loading weights:  12%|██████████████▊                                                                                                         | 93/751 [00:15<02:42,  4.06it/s, Materializing param=model.layers.5.self_attn.q_a_proj.weight]Loading weights:  13%|███████████████                                                                                                         | 94/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.5.self_attn.q_b_proj.weight]Loading weights:  13%|███████████████                                                                                                         | 94/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.5.self_attn.q_b_proj.weight]Loading weights:  13%|███████████████▌                                                                                                           | 95/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  13%|███████████████▌                                                                                                           | 95/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  13%|███████████████▊                                                                                                            | 96/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  13%|███████████████▊                                                                                                            | 96/751 [00:15<02:41,  4.06it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  13%|████████████████                                                                                                            | 97/751 [00:16<01:41,  6.42it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  13%|████████████████                                                                                                            | 97/751 [00:16<01:42,  6.41it/s, Materializing param=model.layers.6.mlp.experts.down_proj]Loading weights:  13%|███████████████▋                                                                                                         | 97/751 [00:16<01:41,  6.42it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|███████████████▋                                                                                                         | 97/751 [00:16<01:42,  6.41it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|███████████████▋                                                                                                         | 97/751 [00:16<01:41,  6.42it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|███████████████▋                                                                                                         | 97/751 [00:16<01:42,  6.41it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|███████████████▊                                                                                                         | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|██████████████▋                                                                                                  | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.e_score_correction_bias]Loading weights:  13%|██████████████▋                                                                                                  | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.e_score_correction_bias]Loading weights:  13%|█████████████████▏                                                                                                                | 99/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.weight]Loading weights:  13%|█████████████████▏                                                                                                                | 99/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.weight]Loading weights:  13%|██████████████▌                                                                                              | 100/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.down_proj.weight]Loading weights:  13%|██████████████▌                                                                                              | 100/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.down_proj.weight]Loading weights:  13%|██████████████▋                                                                                              | 101/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.gate_proj.weight]Loading weights:  13%|██████████████▋                                                                                              | 101/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.gate_proj.weight]Loading weights:  14%|███████████████                                                                                                | 102/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.up_proj.weight]Loading weights:  14%|███████████████                                                                                                | 102/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.up_proj.weight]Loading weights:  14%|███████████████▍                                                                                                 | 103/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  14%|███████████████▍                                                                                                 | 103/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  14%|███████████████▋                                                                                                 | 104/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_layernorm.weight]Loading weights:  14%|███████████████▋                                                                                                 | 104/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_layernorm.weight]Loading weights:  14%|███████████████▏                                                                                             | 105/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  14%|███████████████▏                                                                                             | 105/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  14%|████████████████▋                                                                                                     | 106/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_b_proj.weight]Loading weights:  14%|████████████████▋                                                                                                     | 106/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_b_proj.weight]Loading weights:  14%|█████████████████▏                                                                                                       | 107/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  14%|█████████████████▏                                                                                                       | 107/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  14%|████████████████▍                                                                                                 | 108/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_layernorm.weight]Loading weights:  14%|████████████████▍                                                                                                 | 108/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_layernorm.weight]Loading weights:  15%|█████████████████▎                                                                                                     | 109/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_proj.weight]Loading weights:  15%|█████████████████▎                                                                                                     | 109/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_proj.weight]Loading weights:  15%|█████████████████▍                                                                                                     | 110/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_b_proj.weight]Loading weights:  15%|█████████████████▍                                                                                                     | 110/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_b_proj.weight]Loading weights:  15%|██████████████████                                                                                                        | 111/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  15%|██████████████████                                                                                                        | 111/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  15%|██████████████████▎                                                                                                        | 112/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  15%|██████████████████▎                                                                                                        | 112/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  13%|███████████████▊                                                                                                         | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.experts.gate_up_proj]Loading weights:  13%|██████████████▋                                                                                                  | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.e_score_correction_bias]Loading weights:  13%|██████████████▋                                                                                                  | 98/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.e_score_correction_bias]Loading weights:  13%|█████████████████▏                                                                                                                | 99/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.weight]Loading weights:  13%|█████████████████▏                                                                                                                | 99/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.gate.weight]Loading weights:  13%|██████████████▌                                                                                              | 100/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.down_proj.weight]Loading weights:  13%|██████████████▌                                                                                              | 100/751 [00:18<02:39,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.down_proj.weight]Loading weights:  13%|██████████████▋                                                                                              | 101/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.gate_proj.weight]Loading weights:  13%|██████████████▋                                                                                              | 101/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.gate_proj.weight]Loading weights:  14%|███████████████                                                                                                | 102/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.up_proj.weight]Loading weights:  14%|███████████████                                                                                                | 102/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.mlp.shared_experts.up_proj.weight]Loading weights:  14%|███████████████▍                                                                                                 | 103/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  14%|███████████████▍                                                                                                 | 103/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  14%|███████████████▋                                                                                                 | 104/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_layernorm.weight]Loading weights:  14%|███████████████▋                                                                                                 | 104/751 [00:18<02:38,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_layernorm.weight]Loading weights:  14%|███████████████▏                                                                                             | 105/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  14%|███████████████▏                                                                                             | 105/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  14%|████████████████▋                                                                                                     | 106/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_b_proj.weight]Loading weights:  14%|████████████████▋                                                                                                     | 106/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.kv_b_proj.weight]Loading weights:  14%|█████████████████▏                                                                                                       | 107/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  14%|█████████████████▏                                                                                                       | 107/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  14%|████████████████▍                                                                                                 | 108/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_layernorm.weight]Loading weights:  14%|████████████████▍                                                                                                 | 108/751 [00:18<02:37,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_layernorm.weight]Loading weights:  15%|█████████████████▎                                                                                                     | 109/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_proj.weight]Loading weights:  15%|█████████████████▎                                                                                                     | 109/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_a_proj.weight]Loading weights:  15%|█████████████████▍                                                                                                     | 110/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_b_proj.weight]Loading weights:  15%|█████████████████▍                                                                                                     | 110/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.6.self_attn.q_b_proj.weight]Loading weights:  15%|██████████████████                                                                                                        | 111/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  15%|██████████████████                                                                                                        | 111/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  15%|██████████████████▎                                                                                                        | 112/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  15%|██████████████████▎                                                                                                        | 112/751 [00:18<02:36,  4.09it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  15%|██████████████████▌                                                                                                        | 113/751 [00:19<01:38,  6.46it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  15%|██████████████████▌                                                                                                        | 113/751 [00:19<01:38,  6.47it/s, Materializing param=model.layers.7.mlp.experts.down_proj]Loading weights:  15%|██████████████████                                                                                                      | 113/751 [00:19<01:38,  6.46it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|██████████████████                                                                                                      | 113/751 [00:19<01:38,  6.47it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|██████████████████                                                                                                      | 113/751 [00:19<01:38,  6.46it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|██████████████████                                                                                                      | 113/751 [00:19<01:38,  6.47it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|██████████████████▏                                                                                                     | 114/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|█████████████████                                                                                               | 114/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.gate.e_score_correction_bias]Loading weights:  15%|█████████████████                                                                                               | 114/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.gate.e_score_correction_bias]Loading weights:  15%|███████████████████▊                                                                                                             | 115/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.gate.weight]Loading weights:  15%|███████████████████▊                                                                                                             | 115/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.gate.weight]Loading weights:  15%|████████████████▊                                                                                            | 116/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.down_proj.weight]Loading weights:  15%|████████████████▊                                                                                            | 116/751 [00:21<02:33,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.down_proj.weight]Loading weights:  16%|████████████████▉                                                                                            | 117/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.gate_proj.weight]Loading weights:  16%|████████████████▉                                                                                            | 117/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.gate_proj.weight]Loading weights:  16%|█████████████████▍                                                                                             | 118/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.up_proj.weight]Loading weights:  16%|█████████████████▍                                                                                             | 118/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.mlp.shared_experts.up_proj.weight]Loading weights:  16%|█████████████████▉                                                                                               | 119/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  16%|█████████████████▉                                                                                               | 119/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  16%|██████████████████                                                                                               | 120/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_a_layernorm.weight]Loading weights:  16%|██████████████████                                                                                               | 120/751 [00:21<02:32,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_a_layernorm.weight]Loading weights:  16%|█████████████████▌                                                                                           | 121/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  16%|█████████████████▌                                                                                           | 121/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  16%|███████████████████▏                                                                                                  | 122/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_b_proj.weight]Loading weights:  16%|███████████████████▏                                                                                                  | 122/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.kv_b_proj.weight]Loading weights:  16%|███████████████████▊                                                                                                     | 123/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  16%|███████████████████▊                                                                                                     | 123/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  17%|██████████████████▊                                                                                               | 124/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.q_a_layernorm.weight]Loading weights:  17%|██████████████████▊                                                                                               | 124/751 [00:21<02:31,  4.15it/s, Materializing param=model.layers.7.self_attn.q_a_layernorm.weight]Loading weights:  17%|███████████████████▊                                                                                                   | 125/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.7.self_attn.q_a_proj.weight]Loading weights:  17%|███████████████████▊                                                                                                   | 125/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.7.self_attn.q_a_proj.weight]Loading weights:  17%|███████████████████▉                                                                                                   | 126/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.7.self_attn.q_b_proj.weight]Loading weights:  17%|███████████████████▉                                                                                                   | 126/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.7.self_attn.q_b_proj.weight]Loading weights:  17%|████████████████████▋                                                                                                     | 127/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  17%|████████████████████▋                                                                                                     | 127/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  17%|████████████████████▉                                                                                                      | 128/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  17%|████████████████████▉                                                                                                      | 128/751 [00:21<02:30,  4.15it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  15%|██████████████████▏                                                                                                     | 114/751 [00:21<02:34,  4.13it/s, Materializing param=model.layers.7.mlp.experts.gate_up_proj]Loading weights:  15%|█████████████████                                                                                               | 114/751 [00:21<02:34,  4.13it/s, Materializing param=model.layers.7.mlp.gate.e_score_correction_bias]Loading weights:  15%|█████████████████                                                                                               | 114/751 [00:21<02:34,  4.13it/s, Materializing param=model.layers.7.mlp.gate.e_score_correction_bias]Loading weights:  15%|███████████████████▊                                                                                                             | 115/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.gate.weight]Loading weights:  15%|███████████████████▊                                                                                                             | 115/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.gate.weight]Loading weights:  15%|████████████████▊                                                                                            | 116/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.down_proj.weight]Loading weights:  15%|████████████████▊                                                                                            | 116/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.down_proj.weight]Loading weights:  16%|████████████████▉                                                                                            | 117/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.gate_proj.weight]Loading weights:  16%|████████████████▉                                                                                            | 117/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.gate_proj.weight]Loading weights:  16%|█████████████████▍                                                                                             | 118/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.up_proj.weight]Loading weights:  16%|█████████████████▍                                                                                             | 118/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.mlp.shared_experts.up_proj.weight]Loading weights:  16%|█████████████████▉                                                                                               | 119/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  16%|█████████████████▉                                                                                               | 119/751 [00:21<02:33,  4.13it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  16%|██████████████████                                                                                               | 120/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_a_layernorm.weight]Loading weights:  16%|██████████████████                                                                                               | 120/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_a_layernorm.weight]Loading weights:  16%|█████████████████▌                                                                                           | 121/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  16%|█████████████████▌                                                                                           | 121/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  16%|███████████████████▏                                                                                                  | 122/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_b_proj.weight]Loading weights:  16%|███████████████████▏                                                                                                  | 122/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.kv_b_proj.weight]Loading weights:  16%|███████████████████▊                                                                                                     | 123/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  16%|███████████████████▊                                                                                                     | 123/751 [00:21<02:32,  4.13it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  17%|██████████████████▊                                                                                               | 124/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_a_layernorm.weight]Loading weights:  17%|██████████████████▊                                                                                               | 124/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_a_layernorm.weight]Loading weights:  17%|███████████████████▊                                                                                                   | 125/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_a_proj.weight]Loading weights:  17%|███████████████████▊                                                                                                   | 125/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_a_proj.weight]Loading weights:  17%|███████████████████▉                                                                                                   | 126/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_b_proj.weight]Loading weights:  17%|███████████████████▉                                                                                                   | 126/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.7.self_attn.q_b_proj.weight]Loading weights:  17%|████████████████████▋                                                                                                     | 127/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  17%|████████████████████▋                                                                                                     | 127/751 [00:21<02:31,  4.13it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  17%|████████████████████▉                                                                                                      | 128/751 [00:21<02:30,  4.13it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  17%|████████████████████▉                                                                                                      | 128/751 [00:21<02:30,  4.13it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  17%|█████████████████████▏                                                                                                     | 129/751 [00:22<01:35,  6.52it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  17%|█████████████████████▏                                                                                                     | 129/751 [00:22<01:35,  6.54it/s, Materializing param=model.layers.8.mlp.experts.down_proj]Loading weights:  17%|████████████████████▌                                                                                                   | 129/751 [00:22<01:35,  6.52it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|████████████████████▌                                                                                                   | 129/751 [00:22<01:35,  6.54it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|████████████████████▌                                                                                                   | 129/751 [00:22<01:35,  6.52it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|████████████████████▌                                                                                                   | 129/751 [00:22<01:35,  6.54it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|████████████████████▊                                                                                                   | 130/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|███████████████████▍                                                                                            | 130/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.gate.e_score_correction_bias]Loading weights:  17%|███████████████████▍                                                                                            | 130/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.gate.e_score_correction_bias]Loading weights:  17%|██████████████████████▌                                                                                                          | 131/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.gate.weight]Loading weights:  17%|██████████████████████▌                                                                                                          | 131/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.gate.weight]Loading weights:  18%|███████████████████▏                                                                                         | 132/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.down_proj.weight]Loading weights:  18%|███████████████████▏                                                                                         | 132/751 [00:24<02:27,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.down_proj.weight]Loading weights:  18%|███████████████████▎                                                                                         | 133/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.gate_proj.weight]Loading weights:  18%|███████████████████▎                                                                                         | 133/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.gate_proj.weight]Loading weights:  18%|███████████████████▊                                                                                           | 134/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.up_proj.weight]Loading weights:  18%|███████████████████▊                                                                                           | 134/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.mlp.shared_experts.up_proj.weight]Loading weights:  18%|████████████████████▎                                                                                            | 135/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  18%|████████████████████▎                                                                                            | 135/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  18%|████████████████████▍                                                                                            | 136/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_a_layernorm.weight]Loading weights:  18%|████████████████████▍                                                                                            | 136/751 [00:24<02:26,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_a_layernorm.weight]Loading weights:  18%|███████████████████▉                                                                                         | 137/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  18%|███████████████████▉                                                                                         | 137/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  18%|█████████████████████▋                                                                                                | 138/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_b_proj.weight]Loading weights:  18%|█████████████████████▋                                                                                                | 138/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.kv_b_proj.weight]Loading weights:  19%|██████████████████████▍                                                                                                  | 139/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  19%|██████████████████████▍                                                                                                  | 139/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  19%|█████████████████████▎                                                                                            | 140/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.q_a_layernorm.weight]Loading weights:  19%|█████████████████████▎                                                                                            | 140/751 [00:24<02:25,  4.21it/s, Materializing param=model.layers.8.self_attn.q_a_layernorm.weight]Loading weights:  19%|██████████████████████▎                                                                                                | 141/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.8.self_attn.q_a_proj.weight]Loading weights:  19%|██████████████████████▎                                                                                                | 141/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.8.self_attn.q_a_proj.weight]Loading weights:  19%|██████████████████████▌                                                                                                | 142/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.8.self_attn.q_b_proj.weight]Loading weights:  19%|██████████████████████▌                                                                                                | 142/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.8.self_attn.q_b_proj.weight]Loading weights:  19%|███████████████████████▏                                                                                                  | 143/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  19%|███████████████████████▏                                                                                                  | 143/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  19%|███████████████████████▌                                                                                                   | 144/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  19%|███████████████████████▌                                                                                                   | 144/751 [00:24<02:24,  4.21it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  17%|████████████████████▊                                                                                                   | 130/751 [00:24<02:28,  4.19it/s, Materializing param=model.layers.8.mlp.experts.gate_up_proj]Loading weights:  17%|███████████████████▍                                                                                            | 130/751 [00:24<02:28,  4.19it/s, Materializing param=model.layers.8.mlp.gate.e_score_correction_bias]Loading weights:  17%|███████████████████▍                                                                                            | 130/751 [00:24<02:28,  4.19it/s, Materializing param=model.layers.8.mlp.gate.e_score_correction_bias]Loading weights:  17%|██████████████████████▌                                                                                                          | 131/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.gate.weight]Loading weights:  17%|██████████████████████▌                                                                                                          | 131/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.gate.weight]Loading weights:  18%|███████████████████▏                                                                                         | 132/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.down_proj.weight]Loading weights:  18%|███████████████████▏                                                                                         | 132/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.down_proj.weight]Loading weights:  18%|███████████████████▎                                                                                         | 133/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.gate_proj.weight]Loading weights:  18%|███████████████████▎                                                                                         | 133/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.gate_proj.weight]Loading weights:  18%|███████████████████▊                                                                                           | 134/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.up_proj.weight]Loading weights:  18%|███████████████████▊                                                                                           | 134/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.mlp.shared_experts.up_proj.weight]Loading weights:  18%|████████████████████▎                                                                                            | 135/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  18%|████████████████████▎                                                                                            | 135/751 [00:24<02:27,  4.19it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  18%|████████████████████▍                                                                                            | 136/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_a_layernorm.weight]Loading weights:  18%|████████████████████▍                                                                                            | 136/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_a_layernorm.weight]Loading weights:  18%|███████████████████▉                                                                                         | 137/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  18%|███████████████████▉                                                                                         | 137/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  18%|█████████████████████▋                                                                                                | 138/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_b_proj.weight]Loading weights:  18%|█████████████████████▋                                                                                                | 138/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.kv_b_proj.weight]Loading weights:  19%|██████████████████████▍                                                                                                  | 139/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  19%|██████████████████████▍                                                                                                  | 139/751 [00:24<02:26,  4.19it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  19%|█████████████████████▎                                                                                            | 140/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_a_layernorm.weight]Loading weights:  19%|█████████████████████▎                                                                                            | 140/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_a_layernorm.weight]Loading weights:  19%|██████████████████████▎                                                                                                | 141/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_a_proj.weight]Loading weights:  19%|██████████████████████▎                                                                                                | 141/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_a_proj.weight]Loading weights:  19%|██████████████████████▌                                                                                                | 142/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_b_proj.weight]Loading weights:  19%|██████████████████████▌                                                                                                | 142/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.8.self_attn.q_b_proj.weight]Loading weights:  19%|███████████████████████▏                                                                                                  | 143/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  19%|███████████████████████▏                                                                                                  | 143/751 [00:24<02:25,  4.19it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  19%|███████████████████████▌                                                                                                   | 144/751 [00:24<02:24,  4.19it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  19%|███████████████████████▌                                                                                                   | 144/751 [00:24<02:24,  4.19it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  19%|███████████████████████▋                                                                                                   | 145/751 [00:25<01:31,  6.63it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  19%|███████████████████████▋                                                                                                   | 145/751 [00:25<01:31,  6.64it/s, Materializing param=model.layers.9.mlp.experts.down_proj]Loading weights:  19%|███████████████████████▏                                                                                                | 145/751 [00:25<01:31,  6.63it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  19%|███████████████████████▏                                                                                                | 145/751 [00:25<01:31,  6.64it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  19%|███████████████████████▏                                                                                                | 145/751 [00:25<01:31,  6.63it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  19%|███████████████████████▏                                                                                                | 145/751 [00:25<01:31,  6.64it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  19%|███████████████████████▎                                                                                                | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  19%|█████████████████████▊                                                                                          | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.e_score_correction_bias]Loading weights:  19%|█████████████████████▊                                                                                          | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.e_score_correction_bias]Loading weights:  20%|█████████████████████████▎                                                                                                       | 147/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.weight]Loading weights:  20%|█████████████████████████▎                                                                                                       | 147/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.weight]Loading weights:  20%|█████████████████████▍                                                                                       | 148/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.down_proj.weight]Loading weights:  20%|█████████████████████▍                                                                                       | 148/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.down_proj.weight]Loading weights:  20%|█████████████████████▋                                                                                       | 149/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.gate_proj.weight]Loading weights:  20%|█████████████████████▋                                                                                       | 149/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.gate_proj.weight]Loading weights:  20%|██████████████████████▏                                                                                        | 150/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.up_proj.weight]Loading weights:  20%|██████████████████████▏                                                                                        | 150/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.up_proj.weight]Loading weights:  20%|██████████████████████▋                                                                                          | 151/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  20%|██████████████████████▋                                                                                          | 151/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  20%|██████████████████████▊                                                                                          | 152/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_layernorm.weight]Loading weights:  20%|██████████████████████▊                                                                                          | 152/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_layernorm.weight]Loading weights:  20%|██████████████████████▏                                                                                      | 153/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  20%|██████████████████████▏                                                                                      | 153/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  21%|████████████████████████▏                                                                                             | 154/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_b_proj.weight]Loading weights:  21%|████████████████████████▏                                                                                             | 154/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_b_proj.weight]Loading weights:  19%|███████████████████████▎                                                                                                | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.experts.gate_up_proj]Loading weights:  21%|████████████████████████▉                                                                                                | 155/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  19%|█████████████████████▊                                                                                          | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.e_score_correction_bias]Loading weights:  21%|████████████████████████▉                                                                                                | 155/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  19%|█████████████████████▊                                                                                          | 146/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.e_score_correction_bias]Loading weights:  21%|███████████████████████▋                                                                                          | 156/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_layernorm.weight]Loading weights:  21%|███████████████████████▋                                                                                          | 156/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_layernorm.weight]Loading weights:  20%|█████████████████████████▎                                                                                                       | 147/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.weight]Loading weights:  21%|████████████████████████▉                                                                                              | 157/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_proj.weight]Loading weights:  20%|█████████████████████████▎                                                                                                       | 147/751 [00:27<02:29,  4.05it/s, Materializing param=model.layers.9.mlp.gate.weight]Loading weights:  21%|████████████████████████▉                                                                                              | 157/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_proj.weight]Loading weights:  20%|█████████████████████▍                                                                                       | 148/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.down_proj.weight]Loading weights:  21%|█████████████████████████                                                                                              | 158/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_b_proj.weight]Loading weights:  21%|█████████████████████████                                                                                              | 158/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_b_proj.weight]Loading weights:  20%|█████████████████████▍                                                                                       | 148/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.down_proj.weight]Loading weights:  21%|█████████████████████████▌                                                                                               | 159/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  21%|█████████████████████████▌                                                                                               | 159/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  20%|█████████████████████▋                                                                                       | 149/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.gate_proj.weight]Loading weights:  20%|█████████████████████▋                                                                                       | 149/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.gate_proj.weight]Loading weights:  21%|█████████████████████████▉                                                                                                | 160/751 [00:27<02:25,  4.05it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  21%|█████████████████████████▉                                                                                                | 160/751 [00:27<02:25,  4.05it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  20%|██████████████████████▏                                                                                        | 150/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.up_proj.weight]Loading weights:  20%|██████████████████████▏                                                                                        | 150/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.mlp.shared_experts.up_proj.weight]Loading weights:  20%|██████████████████████▋                                                                                          | 151/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  20%|██████████████████████▋                                                                                          | 151/751 [00:27<02:28,  4.05it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  20%|██████████████████████▊                                                                                          | 152/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_layernorm.weight]Loading weights:  20%|██████████████████████▊                                                                                          | 152/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_layernorm.weight]Loading weights:  20%|██████████████████████▏                                                                                      | 153/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  20%|██████████████████████▏                                                                                      | 153/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  21%|████████████████████████▏                                                                                             | 154/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_b_proj.weight]Loading weights:  21%|████████████████████████▏                                                                                             | 154/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.kv_b_proj.weight]Loading weights:  21%|████████████████████████▉                                                                                                | 155/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  21%|████████████████████████▉                                                                                                | 155/751 [00:27<02:27,  4.05it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  21%|███████████████████████▋                                                                                          | 156/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_layernorm.weight]Loading weights:  21%|███████████████████████▋                                                                                          | 156/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_layernorm.weight]Loading weights:  21%|████████████████████████▉                                                                                              | 157/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_proj.weight]Loading weights:  21%|████████████████████████▉                                                                                              | 157/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_a_proj.weight]Loading weights:  21%|█████████████████████████                                                                                              | 158/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_b_proj.weight]Loading weights:  21%|█████████████████████████                                                                                              | 158/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.9.self_attn.q_b_proj.weight]Loading weights:  21%|█████████████████████████▌                                                                                               | 159/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  21%|█████████████████████████▌                                                                                               | 159/751 [00:27<02:26,  4.05it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  21%|█████████████████████████▉                                                                                                | 160/751 [00:27<02:25,  4.05it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  21%|█████████████████████████▉                                                                                                | 160/751 [00:27<02:25,  4.05it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  21%|██████████████████████████▏                                                                                               | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  21%|██████████████████████████▏                                                                                               | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.down_proj]Loading weights:  21%|█████████████████████████▌                                                                                             | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  21%|█████████████████████████▌                                                                                             | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  21%|█████████████████████████▌                                                                                             | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  21%|█████████████████████████▌                                                                                             | 161/751 [00:28<01:33,  6.31it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  22%|█████████████████████████▋                                                                                             | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  22%|███████████████████████▉                                                                                       | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.e_score_correction_bias]Loading weights:  22%|███████████████████████▉                                                                                       | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.e_score_correction_bias]Loading weights:  22%|███████████████████████████▊                                                                                                    | 163/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.weight]Loading weights:  22%|███████████████████████████▊                                                                                                    | 163/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.weight]Loading weights:  22%|███████████████████████▌                                                                                    | 164/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.down_proj.weight]Loading weights:  22%|███████████████████████▌                                                                                    | 164/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.down_proj.weight]Loading weights:  22%|███████████████████████▋                                                                                    | 165/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.gate_proj.weight]Loading weights:  22%|███████████████████████▋                                                                                    | 165/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.gate_proj.weight]Loading weights:  22%|████████████████████████▎                                                                                     | 166/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.up_proj.weight]Loading weights:  22%|████████████████████████▎                                                                                     | 166/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.up_proj.weight]Loading weights:  22%|████████████████████████▉                                                                                       | 167/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  22%|█████████████████████████▋                                                                                             | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.experts.gate_up_proj]Loading weights:  22%|████████████████████████▉                                                                                       | 167/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  22%|███████████████████████▉                                                                                       | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.e_score_correction_bias]Loading weights:  22%|█████████████████████████                                                                                       | 168/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_layernorm.weight]Loading weights:  22%|███████████████████████▉                                                                                       | 162/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.e_score_correction_bias]Loading weights:  22%|█████████████████████████                                                                                       | 168/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_layernorm.weight]Loading weights:  22%|███████████████████████████▊                                                                                                    | 163/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.weight]Loading weights:  23%|████████████████████████▎                                                                                   | 169/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  22%|███████████████████████████▊                                                                                                    | 163/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.gate.weight]Loading weights:  23%|████████████████████████▎                                                                                   | 169/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  22%|███████████████████████▌                                                                                    | 164/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.down_proj.weight]Loading weights:  23%|██████████████████████████▍                                                                                          | 170/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_b_proj.weight]Loading weights:  22%|███████████████████████▌                                                                                    | 164/751 [00:30<02:25,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.down_proj.weight]Loading weights:  23%|██████████████████████████▍                                                                                          | 170/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_b_proj.weight]Loading weights:  22%|███████████████████████▋                                                                                    | 165/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.gate_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                            | 171/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  22%|███████████████████████▋                                                                                    | 165/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.gate_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                            | 171/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  22%|████████████████████████▎                                                                                     | 166/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.up_proj.weight]Loading weights:  23%|█████████████████████████▉                                                                                       | 172/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_layernorm.weight]Loading weights:  23%|█████████████████████████▉                                                                                       | 172/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_layernorm.weight]Loading weights:  22%|████████████████████████▎                                                                                     | 166/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.mlp.shared_experts.up_proj.weight]Loading weights:  23%|███████████████████████████▏                                                                                          | 173/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_proj.weight]Loading weights:  22%|████████████████████████▉                                                                                       | 167/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  23%|███████████████████████████▏                                                                                          | 173/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_proj.weight]Loading weights:  22%|████████████████████████▉                                                                                       | 167/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  23%|███████████████████████████▎                                                                                          | 174/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_b_proj.weight]Loading weights:  22%|█████████████████████████                                                                                       | 168/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_layernorm.weight]Loading weights:  23%|███████████████████████████▎                                                                                          | 174/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_b_proj.weight]Loading weights:  22%|█████████████████████████                                                                                       | 168/751 [00:30<02:24,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_layernorm.weight]Loading weights:  23%|████████████████████████████▏                                                                                            | 175/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  23%|████████████████████████████▏                                                                                            | 175/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  23%|████████████████████████▎                                                                                   | 169/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  23%|████████████████████████████▌                                                                                             | 176/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  23%|████████████████████████▎                                                                                   | 169/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  23%|████████████████████████████▌                                                                                             | 176/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  23%|██████████████████████████▍                                                                                          | 170/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_b_proj.weight]Loading weights:  23%|██████████████████████████▍                                                                                          | 170/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.kv_b_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                            | 171/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                            | 171/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  23%|█████████████████████████▉                                                                                       | 172/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_layernorm.weight]Loading weights:  23%|█████████████████████████▉                                                                                       | 172/751 [00:30<02:23,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_layernorm.weight]Loading weights:  23%|███████████████████████████▏                                                                                          | 173/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_proj.weight]Loading weights:  23%|███████████████████████████▏                                                                                          | 173/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_a_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                          | 174/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_b_proj.weight]Loading weights:  23%|███████████████████████████▎                                                                                          | 174/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.10.self_attn.q_b_proj.weight]Loading weights:  23%|████████████████████████████▏                                                                                            | 175/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  23%|████████████████████████████▏                                                                                            | 175/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  23%|████████████████████████████▌                                                                                             | 176/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  23%|████████████████████████████▌                                                                                             | 176/751 [00:30<02:22,  4.04it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  24%|████████████████████████████▊                                                                                             | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  24%|████████████████████████████▊                                                                                             | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.down_proj]Loading weights:  24%|████████████████████████████                                                                                           | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|████████████████████████████                                                                                           | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|████████████████████████████                                                                                           | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|████████████████████████████                                                                                           | 177/751 [00:31<01:29,  6.42it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|████████████████████████████▏                                                                                          | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|██████████████████████████▎                                                                                    | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.e_score_correction_bias]Loading weights:  24%|██████████████████████████▎                                                                                    | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.e_score_correction_bias]Loading weights:  24%|██████████████████████████████▌                                                                                                 | 179/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.weight]Loading weights:  24%|██████████████████████████████▌                                                                                                 | 179/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.weight]Loading weights:  24%|█████████████████████████▉                                                                                  | 180/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.down_proj.weight]Loading weights:  24%|█████████████████████████▉                                                                                  | 180/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.down_proj.weight]Loading weights:  24%|██████████████████████████                                                                                  | 181/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.gate_proj.weight]Loading weights:  24%|██████████████████████████                                                                                  | 181/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.gate_proj.weight]Loading weights:  24%|██████████████████████████▋                                                                                   | 182/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.up_proj.weight]Loading weights:  24%|██████████████████████████▋                                                                                   | 182/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.up_proj.weight]Loading weights:  24%|████████████████████████████▏                                                                                          | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.experts.gate_up_proj]Loading weights:  24%|███████████████████████████▎                                                                                    | 183/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  24%|██████████████████████████▎                                                                                    | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.e_score_correction_bias]Loading weights:  24%|███████████████████████████▎                                                                                    | 183/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  24%|██████████████████████████▎                                                                                    | 178/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.e_score_correction_bias]Loading weights:  25%|███████████████████████████▍                                                                                    | 184/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_layernorm.weight]Loading weights:  25%|███████████████████████████▍                                                                                    | 184/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_layernorm.weight]Loading weights:  24%|██████████████████████████████▌                                                                                                 | 179/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.weight]Loading weights:  24%|██████████████████████████████▌                                                                                                 | 179/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.gate.weight]Loading weights:  25%|██████████████████████████▌                                                                                 | 185/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  25%|██████████████████████████▌                                                                                 | 185/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  24%|█████████████████████████▉                                                                                  | 180/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.down_proj.weight]Loading weights:  25%|████████████████████████████▉                                                                                        | 186/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_b_proj.weight]Loading weights:  24%|█████████████████████████▉                                                                                  | 180/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.down_proj.weight]Loading weights:  25%|████████████████████████████▉                                                                                        | 186/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_b_proj.weight]Loading weights:  24%|██████████████████████████                                                                                  | 181/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.gate_proj.weight]Loading weights:  25%|█████████████████████████████▉                                                                                          | 187/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  24%|██████████████████████████                                                                                  | 181/751 [00:33<02:18,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.gate_proj.weight]Loading weights:  25%|█████████████████████████████▉                                                                                          | 187/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  25%|████████████████████████████▎                                                                                    | 188/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_layernorm.weight]Loading weights:  24%|██████████████████████████▋                                                                                   | 182/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.up_proj.weight]Loading weights:  25%|████████████████████████████▎                                                                                    | 188/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_layernorm.weight]Loading weights:  24%|██████████████████████████▋                                                                                   | 182/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.mlp.shared_experts.up_proj.weight]Loading weights:  25%|█████████████████████████████▋                                                                                        | 189/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_proj.weight]Loading weights:  24%|███████████████████████████▎                                                                                    | 183/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  24%|███████████████████████████▎                                                                                    | 183/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  25%|█████████████████████████████▋                                                                                        | 189/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_proj.weight]Loading weights:  25%|███████████████████████████▍                                                                                    | 184/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_layernorm.weight]Loading weights:  25%|█████████████████████████████▊                                                                                        | 190/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.11.self_attn.q_b_proj.weight]Loading weights:  25%|███████████████████████████▍                                                                                    | 184/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_layernorm.weight]Loading weights:  25%|█████████████████████████████▊                                                                                        | 190/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.11.self_attn.q_b_proj.weight]Loading weights:  25%|██████████████████████████▌                                                                                 | 185/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  25%|██████████████████████████████▊                                                                                          | 191/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  25%|██████████████████████████▌                                                                                 | 185/751 [00:33<02:17,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  25%|██████████████████████████████▊                                                                                          | 191/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  26%|███████████████████████████████▏                                                                                          | 192/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  25%|████████████████████████████▉                                                                                        | 186/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_b_proj.weight]Loading weights:  26%|███████████████████████████████▏                                                                                          | 192/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  25%|████████████████████████████▉                                                                                        | 186/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.kv_b_proj.weight]Loading weights:  25%|█████████████████████████████▉                                                                                          | 187/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  25%|█████████████████████████████▉                                                                                          | 187/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  25%|████████████████████████████▎                                                                                    | 188/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_layernorm.weight]Loading weights:  25%|████████████████████████████▎                                                                                    | 188/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_layernorm.weight]Loading weights:  25%|█████████████████████████████▋                                                                                        | 189/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_proj.weight]Loading weights:  25%|█████████████████████████████▋                                                                                        | 189/751 [00:33<02:16,  4.13it/s, Materializing param=model.layers.11.self_attn.q_a_proj.weight]Loading weights:  25%|█████████████████████████████▊                                                                                        | 190/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.11.self_attn.q_b_proj.weight]Loading weights:  25%|█████████████████████████████▊                                                                                        | 190/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.11.self_attn.q_b_proj.weight]Loading weights:  25%|██████████████████████████████▊                                                                                          | 191/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  25%|██████████████████████████████▊                                                                                          | 191/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  26%|███████████████████████████████▏                                                                                          | 192/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  26%|███████████████████████████████▏                                                                                          | 192/751 [00:33<02:15,  4.13it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  26%|███████████████████████████████▎                                                                                          | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  26%|███████████████████████████████▎                                                                                          | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.down_proj]Loading weights:  26%|██████████████████████████████▌                                                                                        | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  26%|██████████████████████████████▌                                                                                        | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  26%|██████████████████████████████▌                                                                                        | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  26%|██████████████████████████████▌                                                                                        | 193/751 [00:34<01:28,  6.33it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  26%|██████████████████████████████▋                                                                                        | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  26%|████████████████████████████▋                                                                                  | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.e_score_correction_bias]Loading weights:  26%|████████████████████████████▋                                                                                  | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.e_score_correction_bias]Loading weights:  26%|█████████████████████████████████▏                                                                                              | 195/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.weight]Loading weights:  26%|█████████████████████████████████▏                                                                                              | 195/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.weight]Loading weights:  26%|████████████████████████████▏                                                                               | 196/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.down_proj.weight]Loading weights:  26%|████████████████████████████▏                                                                               | 196/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.down_proj.weight]Loading weights:  26%|████████████████████████████▎                                                                               | 197/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.gate_proj.weight]Loading weights:  26%|████████████████████████████▎                                                                               | 197/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.gate_proj.weight]Loading weights:  26%|█████████████████████████████                                                                                 | 198/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.up_proj.weight]Loading weights:  26%|█████████████████████████████                                                                                 | 198/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.up_proj.weight]Loading weights:  26%|█████████████████████████████▋                                                                                  | 199/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  26%|█████████████████████████████▋                                                                                  | 199/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  27%|█████████████████████████████▊                                                                                  | 200/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_layernorm.weight]Loading weights:  27%|█████████████████████████████▊                                                                                  | 200/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_layernorm.weight]Loading weights:  27%|████████████████████████████▉                                                                               | 201/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  27%|████████████████████████████▉                                                                               | 201/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  27%|███████████████████████████████▍                                                                                     | 202/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_b_proj.weight]Loading weights:  27%|███████████████████████████████▍                                                                                     | 202/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_b_proj.weight]Loading weights:  26%|██████████████████████████████▋                                                                                        | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.experts.gate_up_proj]Loading weights:  27%|████████████████████████████████▍                                                                                       | 203/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  27%|████████████████████████████████▍                                                                                       | 203/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  26%|████████████████████████████▋                                                                                  | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.e_score_correction_bias]Loading weights:  27%|██████████████████████████████▋                                                                                  | 204/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_layernorm.weight]Loading weights:  26%|████████████████████████████▋                                                                                  | 194/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.e_score_correction_bias]Loading weights:  27%|██████████████████████████████▋                                                                                  | 204/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_layernorm.weight]Loading weights:  26%|█████████████████████████████████▏                                                                                              | 195/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.weight]Loading weights:  27%|████████████████████████████████▏                                                                                     | 205/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_proj.weight]Loading weights:  27%|████████████████████████████████▏                                                                                     | 205/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_proj.weight]Loading weights:  26%|█████████████████████████████████▏                                                                                              | 195/751 [00:37<02:22,  3.91it/s, Materializing param=model.layers.12.mlp.gate.weight]Loading weights:  27%|████████████████████████████████▎                                                                                     | 206/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_b_proj.weight]Loading weights:  26%|████████████████████████████▏                                                                               | 196/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.down_proj.weight]Loading weights:  27%|████████████████████████████████▎                                                                                     | 206/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_b_proj.weight]Loading weights:  26%|████████████████████████████▏                                                                               | 196/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.down_proj.weight]Loading weights:  28%|█████████████████████████████████▎                                                                                       | 207/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  26%|████████████████████████████▎                                                                               | 197/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|█████████████████████████████████▎                                                                                       | 207/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  26%|████████████████████████████▎                                                                               | 197/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|█████████████████████████████████▊                                                                                        | 208/751 [00:37<02:18,  3.91it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████▊                                                                                        | 208/751 [00:37<02:18,  3.91it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  26%|█████████████████████████████                                                                                 | 198/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.up_proj.weight]Loading weights:  26%|█████████████████████████████                                                                                 | 198/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.mlp.shared_experts.up_proj.weight]Loading weights:  26%|█████████████████████████████▋                                                                                  | 199/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  26%|█████████████████████████████▋                                                                                  | 199/751 [00:37<02:21,  3.91it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  27%|█████████████████████████████▊                                                                                  | 200/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_layernorm.weight]Loading weights:  27%|█████████████████████████████▊                                                                                  | 200/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_layernorm.weight]Loading weights:  27%|████████████████████████████▉                                                                               | 201/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  27%|████████████████████████████▉                                                                               | 201/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  27%|███████████████████████████████▍                                                                                     | 202/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_b_proj.weight]Loading weights:  27%|███████████████████████████████▍                                                                                     | 202/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.kv_b_proj.weight]Loading weights:  27%|████████████████████████████████▍                                                                                       | 203/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  27%|████████████████████████████████▍                                                                                       | 203/751 [00:37<02:20,  3.91it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  27%|██████████████████████████████▋                                                                                  | 204/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_layernorm.weight]Loading weights:  27%|██████████████████████████████▋                                                                                  | 204/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_layernorm.weight]Loading weights:  27%|████████████████████████████████▏                                                                                     | 205/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_proj.weight]Loading weights:  27%|████████████████████████████████▏                                                                                     | 205/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_a_proj.weight]Loading weights:  27%|████████████████████████████████▎                                                                                     | 206/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_b_proj.weight]Loading weights:  27%|████████████████████████████████▎                                                                                     | 206/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.12.self_attn.q_b_proj.weight]Loading weights:  28%|█████████████████████████████████▎                                                                                       | 207/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  28%|█████████████████████████████████▎                                                                                       | 207/751 [00:37<02:19,  3.91it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  28%|█████████████████████████████████▊                                                                                        | 208/751 [00:37<02:18,  3.91it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████▊                                                                                        | 208/751 [00:37<02:18,  3.91it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████▉                                                                                        | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████▉                                                                                        | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████                                                                                      | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|█████████████████████████████████                                                                                      | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|█████████████████████████████████                                                                                      | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|█████████████████████████████████                                                                                      | 209/751 [00:38<01:28,  6.16it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|█████████████████████████████████▎                                                                                     | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|███████████████████████████████                                                                                | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.gate.e_score_correction_bias]Loading weights:  28%|███████████████████████████████                                                                                | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.gate.e_score_correction_bias]Loading weights:  28%|███████████████████████████████████▉                                                                                            | 211/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.gate.weight]Loading weights:  28%|███████████████████████████████████▉                                                                                            | 211/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.gate.weight]Loading weights:  28%|██████████████████████████████▍                                                                             | 212/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.down_proj.weight]Loading weights:  28%|██████████████████████████████▍                                                                             | 212/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.down_proj.weight]Loading weights:  28%|██████████████████████████████▋                                                                             | 213/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|██████████████████████████████▋                                                                             | 213/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|███████████████████████████████▎                                                                              | 214/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.up_proj.weight]Loading weights:  28%|███████████████████████████████▎                                                                              | 214/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.up_proj.weight]Loading weights:  29%|████████████████████████████████                                                                                | 215/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  29%|████████████████████████████████                                                                                | 215/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  29%|████████████████████████████████▏                                                                               | 216/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_layernorm.weight]Loading weights:  29%|████████████████████████████████▏                                                                               | 216/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_layernorm.weight]Loading weights:  29%|███████████████████████████████▏                                                                            | 217/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  29%|███████████████████████████████▏                                                                            | 217/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  29%|█████████████████████████████████▉                                                                                   | 218/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_b_proj.weight]Loading weights:  29%|█████████████████████████████████▉                                                                                   | 218/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_b_proj.weight]Loading weights:  29%|██████████████████████████████████▉                                                                                     | 219/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  29%|██████████████████████████████████▉                                                                                     | 219/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  29%|█████████████████████████████████                                                                                | 220/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_layernorm.weight]Loading weights:  29%|█████████████████████████████████                                                                                | 220/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_layernorm.weight]Loading weights:  29%|██████████████████████████████████▋                                                                                   | 221/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_proj.weight]Loading weights:  29%|██████████████████████████████████▋                                                                                   | 221/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_proj.weight]Loading weights:  30%|██████████████████████████████████▉                                                                                   | 222/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_b_proj.weight]Loading weights:  30%|██████████████████████████████████▉                                                                                   | 222/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_b_proj.weight]Loading weights:  30%|███████████████████████████████████▉                                                                                     | 223/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  30%|███████████████████████████████████▉                                                                                     | 223/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  30%|████████████████████████████████████▍                                                                                     | 224/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  30%|████████████████████████████████████▍                                                                                     | 224/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  28%|█████████████████████████████████▎                                                                                     | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.experts.gate_up_proj]Loading weights:  28%|███████████████████████████████                                                                                | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.gate.e_score_correction_bias]Loading weights:  28%|███████████████████████████████                                                                                | 210/751 [00:40<02:14,  4.03it/s, Materializing param=model.layers.13.mlp.gate.e_score_correction_bias]Loading weights:  28%|███████████████████████████████████▉                                                                                            | 211/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.gate.weight]Loading weights:  28%|███████████████████████████████████▉                                                                                            | 211/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.gate.weight]Loading weights:  28%|██████████████████████████████▍                                                                             | 212/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.down_proj.weight]Loading weights:  28%|██████████████████████████████▍                                                                             | 212/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.down_proj.weight]Loading weights:  28%|██████████████████████████████▋                                                                             | 213/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|██████████████████████████████▋                                                                             | 213/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.gate_proj.weight]Loading weights:  28%|███████████████████████████████▎                                                                              | 214/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.up_proj.weight]Loading weights:  28%|███████████████████████████████▎                                                                              | 214/751 [00:40<02:13,  4.03it/s, Materializing param=model.layers.13.mlp.shared_experts.up_proj.weight]Loading weights:  29%|████████████████████████████████                                                                                | 215/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  29%|████████████████████████████████                                                                                | 215/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  29%|████████████████████████████████▏                                                                               | 216/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_layernorm.weight]Loading weights:  29%|████████████████████████████████▏                                                                               | 216/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_layernorm.weight]Loading weights:  29%|███████████████████████████████▏                                                                            | 217/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  29%|███████████████████████████████▏                                                                            | 217/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  29%|█████████████████████████████████▉                                                                                   | 218/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_b_proj.weight]Loading weights:  29%|█████████████████████████████████▉                                                                                   | 218/751 [00:40<02:12,  4.03it/s, Materializing param=model.layers.13.self_attn.kv_b_proj.weight]Loading weights:  29%|██████████████████████████████████▉                                                                                     | 219/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  29%|██████████████████████████████████▉                                                                                     | 219/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  29%|█████████████████████████████████                                                                                | 220/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_layernorm.weight]Loading weights:  29%|█████████████████████████████████                                                                                | 220/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_layernorm.weight]Loading weights:  29%|██████████████████████████████████▋                                                                                   | 221/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_proj.weight]Loading weights:  29%|██████████████████████████████████▋                                                                                   | 221/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_a_proj.weight]Loading weights:  30%|██████████████████████████████████▉                                                                                   | 222/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_b_proj.weight]Loading weights:  30%|██████████████████████████████████▉                                                                                   | 222/751 [00:40<02:11,  4.03it/s, Materializing param=model.layers.13.self_attn.q_b_proj.weight]Loading weights:  30%|███████████████████████████████████▉                                                                                     | 223/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  30%|███████████████████████████████████▉                                                                                     | 223/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  30%|████████████████████████████████████▍                                                                                     | 224/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  30%|████████████████████████████████████▍                                                                                     | 224/751 [00:40<02:10,  4.03it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  30%|████████████████████████████████████▌                                                                                     | 225/751 [00:40<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  30%|████████████████████████████████████▌                                                                                     | 225/751 [00:41<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.down_proj]Loading weights:  30%|███████████████████████████████████▋                                                                                   | 225/751 [00:40<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.gate_up_proj]Loading weights:  30%|███████████████████████████████████▋                                                                                   | 225/751 [00:41<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.gate_up_proj]Loading weights:  30%|███████████████████████████████████▋                                                                                   | 225/751 [00:40<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.gate_up_proj]Loading weights:  30%|███████████████████████████████████▋                                                                                   | 225/751 [00:41<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.experts.gate_up_proj]Loading weights:  30%|█████████████████████████████████▍                                                                             | 226/751 [00:42<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|█████████████████████████████████▍                                                                             | 226/751 [00:42<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|█████████████████████████████████▌                                                                             | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|██████████████████████████████████████▋                                                                                         | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.weight]Loading weights:  30%|██████████████████████████████████████▋                                                                                         | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.weight]Loading weights:  30%|████████████████████████████████▊                                                                           | 228/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.down_proj.weight]Loading weights:  30%|████████████████████████████████▊                                                                           | 228/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.down_proj.weight]Loading weights:  30%|████████████████████████████████▉                                                                           | 229/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.gate_proj.weight]Loading weights:  30%|████████████████████████████████▉                                                                           | 229/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.gate_proj.weight]Loading weights:  31%|█████████████████████████████████▋                                                                            | 230/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.up_proj.weight]Loading weights:  31%|█████████████████████████████████▋                                                                            | 230/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.up_proj.weight]Loading weights:  31%|██████████████████████████████████▍                                                                             | 231/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  31%|██████████████████████████████████▍                                                                             | 231/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  31%|██████████████████████████████████▌                                                                             | 232/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_layernorm.weight]Loading weights:  31%|██████████████████████████████████▌                                                                             | 232/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_layernorm.weight]Loading weights:  31%|█████████████████████████████████▌                                                                          | 233/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  31%|█████████████████████████████████▌                                                                          | 233/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  31%|████████████████████████████████████▍                                                                                | 234/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_b_proj.weight]Loading weights:  31%|████████████████████████████████████▍                                                                                | 234/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_b_proj.weight]Loading weights:  31%|█████████████████████████████████████▌                                                                                  | 235/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  31%|█████████████████████████████████████▌                                                                                  | 235/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  31%|███████████████████████████████████▌                                                                             | 236/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_layernorm.weight]Loading weights:  31%|███████████████████████████████████▌                                                                             | 236/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_layernorm.weight]Loading weights:  32%|█████████████████████████████████████▏                                                                                | 237/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_proj.weight]Loading weights:  32%|█████████████████████████████████████▏                                                                                | 237/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_proj.weight]Loading weights:  32%|█████████████████████████████████████▍                                                                                | 238/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.14.self_attn.q_b_proj.weight]Loading weights:  32%|█████████████████████████████████████▍                                                                                | 238/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.14.self_attn.q_b_proj.weight]Loading weights:  32%|██████████████████████████████████████▌                                                                                  | 239/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  32%|██████████████████████████████████████▌                                                                                  | 239/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  32%|██████████████████████████████████████▉                                                                                   | 240/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  32%|██████████████████████████████████████▉                                                                                   | 240/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  30%|█████████████████████████████████▍                                                                             | 226/751 [00:42<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|█████████████████████████████████▍                                                                             | 226/751 [00:42<01:20,  6.50it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|█████████████████████████████████▌                                                                             | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.e_score_correction_bias]Loading weights:  30%|██████████████████████████████████████▋                                                                                         | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.weight]Loading weights:  30%|██████████████████████████████████████▋                                                                                         | 227/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.gate.weight]Loading weights:  30%|████████████████████████████████▊                                                                           | 228/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.down_proj.weight]Loading weights:  30%|████████████████████████████████▊                                                                           | 228/751 [00:42<01:51,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.down_proj.weight]Loading weights:  30%|████████████████████████████████▉                                                                           | 229/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.gate_proj.weight]Loading weights:  30%|████████████████████████████████▉                                                                           | 229/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.gate_proj.weight]Loading weights:  31%|█████████████████████████████████▋                                                                            | 230/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.up_proj.weight]Loading weights:  31%|█████████████████████████████████▋                                                                            | 230/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.mlp.shared_experts.up_proj.weight]Loading weights:  31%|██████████████████████████████████▍                                                                             | 231/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  31%|██████████████████████████████████▍                                                                             | 231/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  31%|██████████████████████████████████▌                                                                             | 232/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_layernorm.weight]Loading weights:  31%|██████████████████████████████████▌                                                                             | 232/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_layernorm.weight]Loading weights:  31%|█████████████████████████████████▌                                                                          | 233/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  31%|█████████████████████████████████▌                                                                          | 233/751 [00:42<01:50,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  31%|████████████████████████████████████▍                                                                                | 234/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_b_proj.weight]Loading weights:  31%|████████████████████████████████████▍                                                                                | 234/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.kv_b_proj.weight]Loading weights:  31%|█████████████████████████████████████▌                                                                                  | 235/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  31%|█████████████████████████████████████▌                                                                                  | 235/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  31%|███████████████████████████████████▌                                                                             | 236/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_layernorm.weight]Loading weights:  31%|███████████████████████████████████▌                                                                             | 236/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_layernorm.weight]Loading weights:  32%|█████████████████████████████████████▏                                                                                | 237/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_proj.weight]Loading weights:  32%|█████████████████████████████████████▏                                                                                | 237/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_a_proj.weight]Loading weights:  32%|█████████████████████████████████████▍                                                                                | 238/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_b_proj.weight]Loading weights:  32%|█████████████████████████████████████▍                                                                                | 238/751 [00:42<01:49,  4.71it/s, Materializing param=model.layers.14.self_attn.q_b_proj.weight]Loading weights:  32%|██████████████████████████████████████▌                                                                                  | 239/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  32%|██████████████████████████████████████▌                                                                                  | 239/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  32%|██████████████████████████████████████▉                                                                                   | 240/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  32%|██████████████████████████████████████▉                                                                                   | 240/751 [00:42<01:48,  4.71it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  32%|███████████████████████████████████████▏                                                                                  | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  32%|███████████████████████████████████████▏                                                                                  | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.down_proj]Loading weights:  32%|██████████████████████████████████████▏                                                                                | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  32%|██████████████████████████████████████▏                                                                                | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  32%|██████████████████████████████████████▏                                                                                | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  32%|██████████████████████████████████████▏                                                                                | 241/751 [00:43<01:15,  6.75it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  32%|██████████████████████████████████████▎                                                                                | 242/751 [00:45<01:59,  4.27it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  32%|███████████████████████████████████▊                                                                           | 242/751 [00:45<01:59,  4.27it/s, Materializing param=model.layers.15.mlp.gate.e_score_correction_bias]Loading weights:  32%|███████████████████████████████████▊                                                                           | 242/751 [00:45<01:59,  4.27it/s, Materializing param=model.layers.15.mlp.gate.e_score_correction_bias]Loading weights:  32%|█████████████████████████████████████████▍                                                                                      | 243/751 [00:45<01:59,  4.27it/s, Materializing param=model.layers.15.mlp.gate.weight]Loading weights:  32%|█████████████████████████████████████████▍                                                                                      | 243/751 [00:45<01:59,  4.27it/s, Materializing param=model.layers.15.mlp.gate.weight]Loading weights:  32%|███████████████████████████████████                                                                         | 244/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.down_proj.weight]Loading weights:  32%|███████████████████████████████████                                                                         | 244/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.down_proj.weight]Loading weights:  33%|███████████████████████████████████▏                                                                        | 245/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.gate_proj.weight]Loading weights:  33%|███████████████████████████████████▏                                                                        | 245/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.gate_proj.weight]Loading weights:  33%|████████████████████████████████████                                                                          | 246/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.up_proj.weight]Loading weights:  33%|████████████████████████████████████                                                                          | 246/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.mlp.shared_experts.up_proj.weight]Loading weights:  33%|████████████████████████████████████▊                                                                           | 247/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  33%|████████████████████████████████████▊                                                                           | 247/751 [00:45<01:58,  4.27it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  33%|████████████████████████████████████▉                                                                           | 248/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_a_layernorm.weight]Loading weights:  33%|████████████████████████████████████▉                                                                           | 248/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_a_layernorm.weight]Loading weights:  33%|███████████████████████████████████▊                                                                        | 249/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  33%|███████████████████████████████████▊                                                                        | 249/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  33%|██████████████████████████████████████▉                                                                              | 250/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_b_proj.weight]Loading weights:  33%|██████████████████████████████████████▉                                                                              | 250/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.kv_b_proj.weight]Loading weights:  33%|████████████████████████████████████████                                                                                | 251/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  33%|████████████████████████████████████████                                                                                | 251/751 [00:45<01:57,  4.27it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  34%|█████████████████████████████████████▉                                                                           | 252/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_a_layernorm.weight]Loading weights:  34%|█████████████████████████████████████▉                                                                           | 252/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_a_layernorm.weight]Loading weights:  34%|███████████████████████████████████████▊                                                                              | 253/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_a_proj.weight]Loading weights:  34%|███████████████████████████████████████▊                                                                              | 253/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_a_proj.weight]Loading weights:  34%|███████████████████████████████████████▉                                                                              | 254/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_b_proj.weight]Loading weights:  34%|███████████████████████████████████████▉                                                                              | 254/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.15.self_attn.q_b_proj.weight]Loading weights:  34%|█████████████████████████████████████████                                                                                | 255/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  34%|█████████████████████████████████████████                                                                                | 255/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  34%|█████████████████████████████████████████▌                                                                                | 256/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  32%|██████████████████████████████████████▎                                                                                | 242/751 [00:45<01:59,  4.26it/s, Materializing param=model.layers.15.mlp.experts.gate_up_proj]Loading weights:  34%|█████████████████████████████████████████▌                                                                                | 256/751 [00:45<01:56,  4.27it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  32%|███████████████████████████████████▊                                                                           | 242/751 [00:45<01:59,  4.26it/s, Materializing param=model.layers.15.mlp.gate.e_score_correction_bias]Loading weights:  32%|███████████████████████████████████▊                                                                           | 242/751 [00:45<01:59,  4.26it/s, Materializing param=model.layers.15.mlp.gate.e_score_correction_bias]Loading weights:  32%|█████████████████████████████████████████▍                                                                                      | 243/751 [00:45<01:59,  4.26it/s, Materializing param=model.layers.15.mlp.gate.weight]Loading weights:  32%|█████████████████████████████████████████▍                                                                                      | 243/751 [00:45<01:59,  4.26it/s, Materializing param=model.layers.15.mlp.gate.weight]Loading weights:  32%|███████████████████████████████████                                                                         | 244/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.down_proj.weight]Loading weights:  32%|███████████████████████████████████                                                                         | 244/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.down_proj.weight]Loading weights:  33%|███████████████████████████████████▏                                                                        | 245/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.gate_proj.weight]Loading weights:  33%|███████████████████████████████████▏                                                                        | 245/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.gate_proj.weight]Loading weights:  33%|████████████████████████████████████                                                                          | 246/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.up_proj.weight]Loading weights:  33%|████████████████████████████████████                                                                          | 246/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.mlp.shared_experts.up_proj.weight]Loading weights:  33%|████████████████████████████████████▊                                                                           | 247/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  33%|████████████████████████████████████▊                                                                           | 247/751 [00:45<01:58,  4.26it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  33%|████████████████████████████████████▉                                                                           | 248/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_a_layernorm.weight]Loading weights:  33%|████████████████████████████████████▉                                                                           | 248/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_a_layernorm.weight]Loading weights:  33%|███████████████████████████████████▊                                                                        | 249/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  33%|███████████████████████████████████▊                                                                        | 249/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  33%|██████████████████████████████████████▉                                                                              | 250/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_b_proj.weight]Loading weights:  33%|██████████████████████████████████████▉                                                                              | 250/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.kv_b_proj.weight]Loading weights:  33%|████████████████████████████████████████                                                                                | 251/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  33%|████████████████████████████████████████                                                                                | 251/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  34%|█████████████████████████████████████▉                                                                           | 252/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.q_a_layernorm.weight]Loading weights:  34%|█████████████████████████████████████▉                                                                           | 252/751 [00:45<01:57,  4.26it/s, Materializing param=model.layers.15.self_attn.q_a_layernorm.weight]Loading weights:  34%|███████████████████████████████████████▊                                                                              | 253/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.15.self_attn.q_a_proj.weight]Loading weights:  34%|███████████████████████████████████████▊                                                                              | 253/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.15.self_attn.q_a_proj.weight]Loading weights:  34%|███████████████████████████████████████▉                                                                              | 254/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.15.self_attn.q_b_proj.weight]Loading weights:  34%|███████████████████████████████████████▉                                                                              | 254/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.15.self_attn.q_b_proj.weight]Loading weights:  34%|█████████████████████████████████████████                                                                                | 255/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  34%|█████████████████████████████████████████                                                                                | 255/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  34%|█████████████████████████████████████████▌                                                                                | 256/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  34%|█████████████████████████████████████████▌                                                                                | 256/751 [00:45<01:56,  4.26it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  34%|█████████████████████████████████████████▋                                                                                | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  34%|█████████████████████████████████████████▋                                                                                | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.down_proj]Loading weights:  34%|████████████████████████████████████████▋                                                                              | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|████████████████████████████████████████▋                                                                              | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|████████████████████████████████████████▋                                                                              | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|████████████████████████████████████████▋                                                                              | 257/751 [00:47<01:20,  6.14it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|████████████████████████████████████████▉                                                                              | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|██████████████████████████████████████▏                                                                        | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.e_score_correction_bias]Loading weights:  34%|██████████████████████████████████████▏                                                                        | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.e_score_correction_bias]Loading weights:  34%|████████████████████████████████████████████▏                                                                                   | 259/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.weight]Loading weights:  34%|████████████████████████████████████████████▏                                                                                   | 259/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.weight]Loading weights:  35%|█████████████████████████████████████▍                                                                      | 260/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.down_proj.weight]Loading weights:  35%|█████████████████████████████████████▍                                                                      | 260/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.down_proj.weight]Loading weights:  35%|█████████████████████████████████████▌                                                                      | 261/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.gate_proj.weight]Loading weights:  35%|█████████████████████████████████████▌                                                                      | 261/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.gate_proj.weight]Loading weights:  35%|██████████████████████████████████████▍                                                                       | 262/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.up_proj.weight]Loading weights:  35%|██████████████████████████████████████▍                                                                       | 262/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.up_proj.weight]Loading weights:  35%|███████████████████████████████████████▏                                                                        | 263/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▏                                                                        | 263/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▎                                                                        | 264/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▎                                                                        | 264/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_layernorm.weight]Loading weights:  35%|██████████████████████████████████████                                                                      | 265/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  35%|██████████████████████████████████████                                                                      | 265/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  35%|█████████████████████████████████████████▍                                                                           | 266/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_b_proj.weight]Loading weights:  35%|█████████████████████████████████████████▍                                                                           | 266/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_b_proj.weight]Loading weights:  36%|██████████████████████████████████████████▋                                                                             | 267/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  36%|██████████████████████████████████████████▋                                                                             | 267/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  36%|████████████████████████████████████████▎                                                                        | 268/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_layernorm.weight]Loading weights:  36%|████████████████████████████████████████▎                                                                        | 268/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_layernorm.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                           | 269/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_proj.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                           | 269/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_proj.weight]Loading weights:  36%|██████████████████████████████████████████▍                                                                           | 270/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_b_proj.weight]Loading weights:  36%|██████████████████████████████████████████▍                                                                           | 270/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_b_proj.weight]Loading weights:  36%|███████████████████████████████████████████▋                                                                             | 271/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  36%|███████████████████████████████████████████▋                                                                             | 271/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  36%|████████████████████████████████████████████▏                                                                             | 272/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  36%|████████████████████████████████████████████▏                                                                             | 272/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  34%|████████████████████████████████████████▉                                                                              | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.experts.gate_up_proj]Loading weights:  34%|██████████████████████████████████████▏                                                                        | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.e_score_correction_bias]Loading weights:  34%|██████████████████████████████████████▏                                                                        | 258/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.e_score_correction_bias]Loading weights:  34%|████████████████████████████████████████████▏                                                                                   | 259/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.weight]Loading weights:  34%|████████████████████████████████████████████▏                                                                                   | 259/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.gate.weight]Loading weights:  35%|█████████████████████████████████████▍                                                                      | 260/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.down_proj.weight]Loading weights:  35%|█████████████████████████████████████▍                                                                      | 260/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.down_proj.weight]Loading weights:  35%|█████████████████████████████████████▌                                                                      | 261/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.gate_proj.weight]Loading weights:  35%|█████████████████████████████████████▌                                                                      | 261/751 [00:49<02:12,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.gate_proj.weight]Loading weights:  35%|██████████████████████████████████████▍                                                                       | 262/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.up_proj.weight]Loading weights:  35%|██████████████████████████████████████▍                                                                       | 262/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.mlp.shared_experts.up_proj.weight]Loading weights:  35%|███████████████████████████████████████▏                                                                        | 263/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▏                                                                        | 263/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▎                                                                        | 264/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_layernorm.weight]Loading weights:  35%|███████████████████████████████████████▎                                                                        | 264/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_layernorm.weight]Loading weights:  35%|██████████████████████████████████████                                                                      | 265/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  35%|██████████████████████████████████████                                                                      | 265/751 [00:49<02:11,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  35%|█████████████████████████████████████████▍                                                                           | 266/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_b_proj.weight]Loading weights:  35%|█████████████████████████████████████████▍                                                                           | 266/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.kv_b_proj.weight]Loading weights:  36%|██████████████████████████████████████████▋                                                                             | 267/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  36%|██████████████████████████████████████████▋                                                                             | 267/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  36%|████████████████████████████████████████▎                                                                        | 268/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_layernorm.weight]Loading weights:  36%|████████████████████████████████████████▎                                                                        | 268/751 [00:49<02:10,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_layernorm.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                           | 269/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_proj.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                           | 269/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_a_proj.weight]Loading weights:  36%|██████████████████████████████████████████▍                                                                           | 270/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_b_proj.weight]Loading weights:  36%|██████████████████████████████████████████▍                                                                           | 270/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.16.self_attn.q_b_proj.weight]Loading weights:  36%|███████████████████████████████████████████▋                                                                             | 271/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  36%|███████████████████████████████████████████▋                                                                             | 271/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  36%|████████████████████████████████████████████▏                                                                             | 272/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  36%|████████████████████████████████████████████▏                                                                             | 272/751 [00:49<02:09,  3.71it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  36%|████████████████████████████████████████████▎                                                                             | 273/751 [00:50<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  36%|████████████████████████████████████████████▎                                                                             | 273/751 [00:51<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.down_proj]Loading weights:  36%|███████████████████████████████████████████▎                                                                           | 273/751 [00:50<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|███████████████████████████████████████████▎                                                                           | 273/751 [00:51<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|███████████████████████████████████████████▎                                                                           | 273/751 [00:50<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|███████████████████████████████████████████▎                                                                           | 273/751 [00:51<01:28,  5.43it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|███████████████████████████████████████████▍                                                                           | 274/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|████████████████████████████████████████▍                                                                      | 274/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.gate.e_score_correction_bias]Loading weights:  36%|████████████████████████████████████████▍                                                                      | 274/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.gate.e_score_correction_bias]Loading weights:  37%|██████████████████████████████████████████████▊                                                                                 | 275/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.gate.weight]Loading weights:  37%|██████████████████████████████████████████████▊                                                                                 | 275/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.gate.weight]Loading weights:  37%|███████████████████████████████████████▋                                                                    | 276/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.down_proj.weight]Loading weights:  37%|███████████████████████████████████████▋                                                                    | 276/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.down_proj.weight]Loading weights:  37%|███████████████████████████████████████▊                                                                    | 277/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.gate_proj.weight]Loading weights:  37%|███████████████████████████████████████▊                                                                    | 277/751 [00:53<02:26,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.gate_proj.weight]Loading weights:  37%|████████████████████████████████████████▋                                                                     | 278/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.up_proj.weight]Loading weights:  37%|████████████████████████████████████████▋                                                                     | 278/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.mlp.shared_experts.up_proj.weight]Loading weights:  37%|█████████████████████████████████████████▌                                                                      | 279/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▌                                                                      | 279/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▊                                                                      | 280/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_a_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▊                                                                      | 280/751 [00:53<02:25,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_a_layernorm.weight]Loading weights:  37%|████████████████████████████████████████▍                                                                   | 281/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  37%|████████████████████████████████████████▍                                                                   | 281/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  38%|███████████████████████████████████████████▉                                                                         | 282/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_b_proj.weight]Loading weights:  38%|███████████████████████████████████████████▉                                                                         | 282/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.kv_b_proj.weight]Loading weights:  38%|█████████████████████████████████████████████▏                                                                          | 283/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  38%|█████████████████████████████████████████████▏                                                                          | 283/751 [00:53<02:24,  3.25it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  38%|██████████████████████████████████████████▋                                                                      | 284/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_a_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████▋                                                                      | 284/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_a_layernorm.weight]Loading weights:  38%|████████████████████████████████████████████▊                                                                         | 285/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_a_proj.weight]Loading weights:  38%|████████████████████████████████████████████▊                                                                         | 285/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_a_proj.weight]Loading weights:  38%|████████████████████████████████████████████▉                                                                         | 286/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_b_proj.weight]Loading weights:  38%|████████████████████████████████████████████▉                                                                         | 286/751 [00:53<02:23,  3.25it/s, Materializing param=model.layers.17.self_attn.q_b_proj.weight]Loading weights:  38%|██████████████████████████████████████████████▏                                                                          | 287/751 [00:53<02:22,  3.25it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████████▏                                                                          | 287/751 [00:53<02:22,  3.25it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████████▊                                                                           | 288/751 [00:53<02:22,  3.25it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  38%|██████████████████████████████████████████████▊                                                                           | 288/751 [00:53<02:22,  3.25it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  36%|███████████████████████████████████████████▍                                                                           | 274/751 [00:54<02:27,  3.24it/s, Materializing param=model.layers.17.mlp.experts.gate_up_proj]Loading weights:  36%|████████████████████████████████████████▍                                                                      | 274/751 [00:54<02:27,  3.24it/s, Materializing param=model.layers.17.mlp.gate.e_score_correction_bias]Loading weights:  36%|████████████████████████████████████████▍                                                                      | 274/751 [00:54<02:27,  3.24it/s, Materializing param=model.layers.17.mlp.gate.e_score_correction_bias]Loading weights:  37%|██████████████████████████████████████████████▊                                                                                 | 275/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.gate.weight]Loading weights:  37%|██████████████████████████████████████████████▊                                                                                 | 275/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.gate.weight]Loading weights:  37%|███████████████████████████████████████▋                                                                    | 276/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.down_proj.weight]Loading weights:  37%|███████████████████████████████████████▋                                                                    | 276/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.down_proj.weight]Loading weights:  37%|███████████████████████████████████████▊                                                                    | 277/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.gate_proj.weight]Loading weights:  37%|███████████████████████████████████████▊                                                                    | 277/751 [00:54<02:26,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.gate_proj.weight]Loading weights:  37%|████████████████████████████████████████▋                                                                     | 278/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.up_proj.weight]Loading weights:  37%|████████████████████████████████████████▋                                                                     | 278/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.mlp.shared_experts.up_proj.weight]Loading weights:  37%|█████████████████████████████████████████▌                                                                      | 279/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▌                                                                      | 279/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▊                                                                      | 280/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_a_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████▊                                                                      | 280/751 [00:54<02:25,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_a_layernorm.weight]Loading weights:  37%|████████████████████████████████████████▍                                                                   | 281/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  37%|████████████████████████████████████████▍                                                                   | 281/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  38%|███████████████████████████████████████████▉                                                                         | 282/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_b_proj.weight]Loading weights:  38%|███████████████████████████████████████████▉                                                                         | 282/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.kv_b_proj.weight]Loading weights:  38%|█████████████████████████████████████████████▏                                                                          | 283/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  38%|█████████████████████████████████████████████▏                                                                          | 283/751 [00:54<02:24,  3.24it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  38%|██████████████████████████████████████████▋                                                                      | 284/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_a_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████▋                                                                      | 284/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_a_layernorm.weight]Loading weights:  38%|████████████████████████████████████████████▊                                                                         | 285/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_a_proj.weight]Loading weights:  38%|████████████████████████████████████████████▊                                                                         | 285/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_a_proj.weight]Loading weights:  38%|████████████████████████████████████████████▉                                                                         | 286/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_b_proj.weight]Loading weights:  38%|████████████████████████████████████████████▉                                                                         | 286/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.17.self_attn.q_b_proj.weight]Loading weights:  38%|██████████████████████████████████████████████▏                                                                          | 287/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████████▏                                                                          | 287/751 [00:54<02:23,  3.24it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  38%|██████████████████████████████████████████████▊                                                                           | 288/751 [00:54<02:22,  3.24it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  38%|██████████████████████████████████████████████▊                                                                           | 288/751 [00:54<02:22,  3.24it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  38%|██████████████████████████████████████████████▉                                                                           | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  38%|██████████████████████████████████████████████▉                                                                           | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.down_proj]Loading weights:  38%|█████████████████████████████████████████████▊                                                                         | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  38%|█████████████████████████████████████████████▊                                                                         | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  38%|█████████████████████████████████████████████▊                                                                         | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  38%|█████████████████████████████████████████████▊                                                                         | 289/751 [00:55<01:34,  4.91it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  39%|█████████████████████████████████████████████▉                                                                         | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  39%|██████████████████████████████████████████▊                                                                    | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.e_score_correction_bias]Loading weights:  39%|██████████████████████████████████████████▊                                                                    | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.e_score_correction_bias]Loading weights:  39%|█████████████████████████████████████████████████▌                                                                              | 291/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.weight]Loading weights:  39%|█████████████████████████████████████████████████▌                                                                              | 291/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.weight]Loading weights:  39%|█████████████████████████████████████████▉                                                                  | 292/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.down_proj.weight]Loading weights:  39%|█████████████████████████████████████████▉                                                                  | 292/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.down_proj.weight]Loading weights:  39%|██████████████████████████████████████████▏                                                                 | 293/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.gate_proj.weight]Loading weights:  39%|██████████████████████████████████████████▏                                                                 | 293/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.gate_proj.weight]Loading weights:  39%|███████████████████████████████████████████                                                                   | 294/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.up_proj.weight]Loading weights:  39%|███████████████████████████████████████████                                                                   | 294/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.up_proj.weight]Loading weights:  39%|███████████████████████████████████████████▉                                                                    | 295/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  39%|███████████████████████████████████████████▉                                                                    | 295/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  39%|████████████████████████████████████████████▏                                                                   | 296/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_layernorm.weight]Loading weights:  39%|████████████████████████████████████████████▏                                                                   | 296/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_layernorm.weight]Loading weights:  40%|██████████████████████████████████████████▋                                                                 | 297/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  40%|██████████████████████████████████████████▋                                                                 | 297/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  40%|██████████████████████████████████████████████▍                                                                      | 298/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_b_proj.weight]Loading weights:  40%|██████████████████████████████████████████████▍                                                                      | 298/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_b_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▊                                                                        | 299/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▊                                                                        | 299/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  40%|█████████████████████████████████████████████▏                                                                   | 300/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_layernorm.weight]Loading weights:  40%|█████████████████████████████████████████████▏                                                                   | 300/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_layernorm.weight]Loading weights:  40%|███████████████████████████████████████████████▎                                                                      | 301/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▎                                                                      | 301/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▍                                                                      | 302/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.18.self_attn.q_b_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▍                                                                      | 302/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.18.self_attn.q_b_proj.weight]Loading weights:  40%|████████████████████████████████████████████████▊                                                                        | 303/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  40%|████████████████████████████████████████████████▊                                                                        | 303/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  39%|█████████████████████████████████████████████▉                                                                         | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.experts.gate_up_proj]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                        | 304/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                        | 304/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  39%|██████████████████████████████████████████▊                                                                    | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.e_score_correction_bias]Loading weights:  39%|██████████████████████████████████████████▊                                                                    | 290/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.e_score_correction_bias]Loading weights:  39%|█████████████████████████████████████████████████▌                                                                              | 291/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.weight]Loading weights:  39%|█████████████████████████████████████████████████▌                                                                              | 291/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.gate.weight]Loading weights:  39%|█████████████████████████████████████████▉                                                                  | 292/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.down_proj.weight]Loading weights:  39%|█████████████████████████████████████████▉                                                                  | 292/751 [00:58<02:32,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.down_proj.weight]Loading weights:  39%|██████████████████████████████████████████▏                                                                 | 293/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.gate_proj.weight]Loading weights:  39%|██████████████████████████████████████████▏                                                                 | 293/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.gate_proj.weight]Loading weights:  39%|███████████████████████████████████████████                                                                   | 294/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.up_proj.weight]Loading weights:  39%|███████████████████████████████████████████                                                                   | 294/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.mlp.shared_experts.up_proj.weight]Loading weights:  39%|███████████████████████████████████████████▉                                                                    | 295/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  39%|███████████████████████████████████████████▉                                                                    | 295/751 [00:58<02:31,  3.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  39%|████████████████████████████████████████████▏                                                                   | 296/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_layernorm.weight]Loading weights:  39%|████████████████████████████████████████████▏                                                                   | 296/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_layernorm.weight]Loading weights:  40%|██████████████████████████████████████████▋                                                                 | 297/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  40%|██████████████████████████████████████████▋                                                                 | 297/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  40%|██████████████████████████████████████████████▍                                                                      | 298/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_b_proj.weight]Loading weights:  40%|██████████████████████████████████████████████▍                                                                      | 298/751 [00:58<02:30,  3.02it/s, Materializing param=model.layers.18.self_attn.kv_b_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▊                                                                        | 299/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▊                                                                        | 299/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  40%|█████████████████████████████████████████████▏                                                                   | 300/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_layernorm.weight]Loading weights:  40%|█████████████████████████████████████████████▏                                                                   | 300/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_layernorm.weight]Loading weights:  40%|███████████████████████████████████████████████▎                                                                      | 301/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▎                                                                      | 301/751 [00:58<02:29,  3.02it/s, Materializing param=model.layers.18.self_attn.q_a_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▍                                                                      | 302/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.18.self_attn.q_b_proj.weight]Loading weights:  40%|███████████████████████████████████████████████▍                                                                      | 302/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.18.self_attn.q_b_proj.weight]Loading weights:  40%|████████████████████████████████████████████████▊                                                                        | 303/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  40%|████████████████████████████████████████████████▊                                                                        | 303/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                        | 304/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                        | 304/751 [00:58<02:28,  3.02it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  41%|█████████████████████████████████████████████████▌                                                                        | 305/751 [00:59<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  41%|█████████████████████████████████████████████████▌                                                                        | 305/751 [01:00<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.down_proj]Loading weights:  41%|████████████████████████████████████████████████▎                                                                      | 305/751 [00:59<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|████████████████████████████████████████████████▎                                                                      | 305/751 [01:00<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|████████████████████████████████████████████████▎                                                                      | 305/751 [00:59<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|████████████████████████████████████████████████▎                                                                      | 305/751 [01:00<01:35,  4.65it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|████████████████████████████████████████████████▍                                                                      | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|████████████████████████████████████████████████▍                                                                      | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.experts.gate_up_proj]Loading weights:  41%|█████████████████████████████████████████████▏                                                                 | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.e_score_correction_bias]Loading weights:  41%|█████████████████████████████████████████████▏                                                                 | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.e_score_correction_bias]Loading weights:  41%|█████████████████████████████████████████████▏                                                                 | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.e_score_correction_bias]Loading weights:  41%|█████████████████████████████████████████████▏                                                                 | 306/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.e_score_correction_bias]Loading weights:  41%|████████████████████████████████████████████████████▎                                                                           | 307/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.weight]Loading weights:  41%|████████████████████████████████████████████████████▎                                                                           | 307/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.weight]Loading weights:  41%|████████████████████████████████████████████████████▎                                                                           | 307/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.weight]Loading weights:  41%|████████████████████████████████████████████████████▎                                                                           | 307/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.gate.weight]Loading weights:  41%|████████████████████████████████████████████▎                                                               | 308/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.down_proj.weight]Loading weights:  41%|████████████████████████████████████████████▎                                                               | 308/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.down_proj.weight]Loading weights:  41%|████████████████████████████████████████████▎                                                               | 308/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.down_proj.weight]Loading weights:  41%|████████████████████████████████████████████▎                                                               | 308/751 [01:03<02:34,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.down_proj.weight]Loading weights:  41%|████████████████████████████████████████████▍                                                               | 309/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.gate_proj.weight]Loading weights:  41%|████████████████████████████████████████████▍                                                               | 309/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.gate_proj.weight]Loading weights:  41%|████████████████████████████████████████████▍                                                               | 309/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.gate_proj.weight]Loading weights:  41%|█████████████████████████████████████████████▍                                                                | 310/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.up_proj.weight]Loading weights:  41%|████████████████████████████████████████████▍                                                               | 309/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.gate_proj.weight]Loading weights:  41%|█████████████████████████████████████████████▍                                                                | 310/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.up_proj.weight]Loading weights:  41%|█████████████████████████████████████████████▍                                                                | 310/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.up_proj.weight]Loading weights:  41%|██████████████████████████████████████████████▍                                                                 | 311/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  41%|█████████████████████████████████████████████▍                                                                | 310/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.mlp.shared_experts.up_proj.weight]Loading weights:  41%|██████████████████████████████████████████████▍                                                                 | 311/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  41%|██████████████████████████████████████████████▍                                                                 | 311/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  42%|██████████████████████████████████████████████▌                                                                 | 312/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_layernorm.weight]Loading weights:  41%|██████████████████████████████████████████████▍                                                                 | 311/751 [01:03<02:33,  2.87it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  42%|██████████████████████████████████████████████▌                                                                 | 312/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_layernorm.weight]Loading weights:  42%|██████████████████████████████████████████████▌                                                                 | 312/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_layernorm.weight]Loading weights:  42%|█████████████████████████████████████████████                                                               | 313/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  42%|██████████████████████████████████████████████▌                                                                 | 312/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_layernorm.weight]Loading weights:  42%|█████████████████████████████████████████████                                                               | 313/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  42%|█████████████████████████████████████████████                                                               | 313/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  42%|████████████████████████████████████████████████▉                                                                    | 314/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_b_proj.weight]Loading weights:  42%|█████████████████████████████████████████████                                                               | 313/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  42%|████████████████████████████████████████████████▉                                                                    | 314/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_b_proj.weight]Loading weights:  42%|████████████████████████████████████████████████▉                                                                    | 314/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_b_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████▎                                                                     | 315/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  42%|████████████████████████████████████████████████▉                                                                    | 314/751 [01:03<02:32,  2.87it/s, Materializing param=model.layers.19.self_attn.kv_b_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████▎                                                                     | 315/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████▎                                                                     | 315/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████▎                                                                     | 315/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  42%|███████████████████████████████████████████████▌                                                                 | 316/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████▌                                                                 | 316/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████▌                                                                 | 316/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████▌                                                                 | 316/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_layernorm.weight]Loading weights:  42%|█████████████████████████████████████████████████▊                                                                    | 317/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▊                                                                    | 317/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▊                                                                    | 317/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▊                                                                    | 317/751 [01:03<02:31,  2.87it/s, Materializing param=model.layers.19.self_attn.q_a_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▉                                                                    | 318/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.19.self_attn.q_b_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▉                                                                    | 318/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.19.self_attn.q_b_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▉                                                                    | 318/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.19.self_attn.q_b_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████▉                                                                    | 318/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.19.self_attn.q_b_proj.weight]Loading weights:  42%|███████████████████████████████████████████████████▍                                                                     | 319/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████████▍                                                                     | 319/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████████▍                                                                     | 319/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  42%|███████████████████████████████████████████████████▍                                                                     | 319/751 [01:03<02:30,  2.87it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  43%|███████████████████████████████████████████████████▉                                                                      | 320/751 [01:03<02:29,  2.87it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|███████████████████████████████████████████████████▉                                                                      | 320/751 [01:03<02:29,  2.87it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|███████████████████████████████████████████████████▉                                                                      | 320/751 [01:03<02:29,  2.87it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|███████████████████████████████████████████████████▉                                                                      | 320/751 [01:03<02:29,  2.87it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|████████████████████████████████████████████████████▏                                                                     | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|████████████████████████████████████████████████████▏                                                                     | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.down_proj]Loading weights:  43%|██████████████████████████████████████████████████▊                                                                    | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  43%|██████████████████████████████████████████████████▊                                                                    | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  43%|██████████████████████████████████████████████████▊                                                                    | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  43%|██████████████████████████████████████████████████▊                                                                    | 321/751 [01:04<01:37,  4.40it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  43%|███████████████████████████████████████████████████                                                                    | 322/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  43%|███████████████████████████████████████████████▌                                                               | 322/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.e_score_correction_bias]Loading weights:  43%|███████████████████████████████████████████████▌                                                               | 322/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.e_score_correction_bias]Loading weights:  43%|███████████████████████████████████████████████████████                                                                         | 323/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.weight]Loading weights:  43%|███████████████████████████████████████████████████████                                                                         | 323/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.weight]Loading weights:  43%|██████████████████████████████████████████████▌                                                             | 324/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.down_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▌                                                             | 324/751 [01:07<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.down_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▋                                                             | 325/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.gate_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▋                                                             | 325/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.gate_proj.weight]Loading weights:  43%|███████████████████████████████████████████████▋                                                              | 326/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.up_proj.weight]Loading weights:  43%|███████████████████████████████████████████████▋                                                              | 326/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.up_proj.weight]Loading weights:  44%|████████████████████████████████████████████████▊                                                               | 327/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▊                                                               | 327/751 [01:07<02:33,  2.77it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▉                                                               | 328/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▉                                                               | 328/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_layernorm.weight]Loading weights:  44%|███████████████████████████████████████████████▎                                                            | 329/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  44%|███████████████████████████████████████████████▎                                                            | 329/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  44%|███████████████████████████████████████████████████▍                                                                 | 330/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_b_proj.weight]Loading weights:  44%|███████████████████████████████████████████████████▍                                                                 | 330/751 [01:07<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_b_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▉                                                                   | 331/751 [01:07<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  43%|███████████████████████████████████████████████████                                                                    | 322/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.experts.gate_up_proj]Loading weights:  44%|████████████████████████████████████████████████████▉                                                                   | 331/751 [01:07<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  43%|███████████████████████████████████████████████▌                                                               | 322/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.e_score_correction_bias]Loading weights:  44%|█████████████████████████████████████████████████▉                                                               | 332/751 [01:07<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_layernorm.weight]Loading weights:  43%|███████████████████████████████████████████████▌                                                               | 322/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.e_score_correction_bias]Loading weights:  44%|█████████████████████████████████████████████████▉                                                               | 332/751 [01:07<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_layernorm.weight]Loading weights:  43%|███████████████████████████████████████████████████████                                                                         | 323/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.weight]Loading weights:  44%|████████████████████████████████████████████████████▎                                                                 | 333/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_proj.weight]Loading weights:  43%|███████████████████████████████████████████████████████                                                                         | 323/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.gate.weight]Loading weights:  44%|████████████████████████████████████████████████████▎                                                                 | 333/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▌                                                             | 324/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.down_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▍                                                                 | 334/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_b_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▌                                                             | 324/751 [01:08<02:34,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.down_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▍                                                                 | 334/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_b_proj.weight]Loading weights:  43%|██████████████████████████████████████████████▋                                                             | 325/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.gate_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████████▉                                                                   | 335/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  43%|██████████████████████████████████████████████▋                                                             | 325/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.gate_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████████▉                                                                   | 335/751 [01:07<02:30,  2.77it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  43%|███████████████████████████████████████████████▋                                                              | 326/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.up_proj.weight]Loading weights:  43%|███████████████████████████████████████████████▋                                                              | 326/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.mlp.shared_experts.up_proj.weight]Loading weights:  45%|██████████████████████████████████████████████████████▌                                                                   | 336/751 [01:07<02:29,  2.77it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  45%|██████████████████████████████████████████████████████▌                                                                   | 336/751 [01:07<02:29,  2.77it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  44%|████████████████████████████████████████████████▊                                                               | 327/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▊                                                               | 327/751 [01:08<02:33,  2.77it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▉                                                               | 328/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████▉                                                               | 328/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_layernorm.weight]Loading weights:  44%|███████████████████████████████████████████████▎                                                            | 329/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  44%|███████████████████████████████████████████████▎                                                            | 329/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  44%|███████████████████████████████████████████████████▍                                                                 | 330/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_b_proj.weight]Loading weights:  44%|███████████████████████████████████████████████████▍                                                                 | 330/751 [01:08<02:32,  2.77it/s, Materializing param=model.layers.20.self_attn.kv_b_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▉                                                                   | 331/751 [01:08<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▉                                                                   | 331/751 [01:08<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  44%|█████████████████████████████████████████████████▉                                                               | 332/751 [01:08<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_layernorm.weight]Loading weights:  44%|█████████████████████████████████████████████████▉                                                               | 332/751 [01:08<02:31,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_layernorm.weight]Loading weights:  44%|████████████████████████████████████████████████████▎                                                                 | 333/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▎                                                                 | 333/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_a_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▍                                                                 | 334/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_b_proj.weight]Loading weights:  44%|████████████████████████████████████████████████████▍                                                                 | 334/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.20.self_attn.q_b_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████████▉                                                                   | 335/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  45%|█████████████████████████████████████████████████████▉                                                                   | 335/751 [01:08<02:30,  2.77it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  45%|██████████████████████████████████████████████████████▌                                                                   | 336/751 [01:08<02:29,  2.77it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  45%|██████████████████████████████████████████████████████▌                                                                   | 336/751 [01:08<02:29,  2.77it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  45%|██████████████████████████████████████████████████████▋                                                                   | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  45%|██████████████████████████████████████████████████████▋                                                                   | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.down_proj]Loading weights:  45%|█████████████████████████████████████████████████████▍                                                                 | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████████▍                                                                 | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████████▍                                                                 | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████████▍                                                                 | 337/751 [01:09<01:37,  4.25it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████████▌                                                                 | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████▉                                                             | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.e_score_correction_bias]Loading weights:  45%|█████████████████████████████████████████████████▉                                                             | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.e_score_correction_bias]Loading weights:  45%|█████████████████████████████████████████████████████████▊                                                                      | 339/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.weight]Loading weights:  45%|█████████████████████████████████████████████████████████▊                                                                      | 339/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.weight]Loading weights:  45%|████████████████████████████████████████████████▉                                                           | 340/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.down_proj.weight]Loading weights:  45%|████████████████████████████████████████████████▉                                                           | 340/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.down_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████                                                           | 341/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.gate_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████                                                           | 341/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.gate_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████                                                            | 342/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.up_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████                                                            | 342/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.up_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████▏                                                            | 343/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▏                                                            | 343/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▎                                                            | 344/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▎                                                            | 344/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_layernorm.weight]Loading weights:  46%|█████████████████████████████████████████████████▌                                                          | 345/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  46%|█████████████████████████████████████████████████▌                                                          | 345/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  46%|█████████████████████████████████████████████████████▉                                                               | 346/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_b_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████▉                                                               | 346/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_b_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████████▍                                                                | 347/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████████▍                                                                | 347/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  46%|████████████████████████████████████████████████████▎                                                            | 348/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_layernorm.weight]Loading weights:  46%|████████████████████████████████████████████████████▎                                                            | 348/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_layernorm.weight]Loading weights:  46%|██████████████████████████████████████████████████████▊                                                               | 349/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████████▊                                                               | 349/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████▉                                                               | 350/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.21.self_attn.q_b_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████▉                                                               | 350/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.21.self_attn.q_b_proj.weight]Loading weights:  47%|████████████████████████████████████████████████████████▌                                                                | 351/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  47%|████████████████████████████████████████████████████████▌                                                                | 351/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  47%|█████████████████████████████████████████████████████████▏                                                                | 352/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  47%|█████████████████████████████████████████████████████████▏                                                                | 352/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  45%|█████████████████████████████████████████████████████▌                                                                 | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.experts.gate_up_proj]Loading weights:  45%|█████████████████████████████████████████████████▉                                                             | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.e_score_correction_bias]Loading weights:  45%|█████████████████████████████████████████████████▉                                                             | 338/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.e_score_correction_bias]Loading weights:  45%|█████████████████████████████████████████████████████████▊                                                                      | 339/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.weight]Loading weights:  45%|█████████████████████████████████████████████████████████▊                                                                      | 339/751 [01:12<02:33,  2.69it/s, Materializing param=model.layers.21.mlp.gate.weight]Loading weights:  45%|████████████████████████████████████████████████▉                                                           | 340/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.down_proj.weight]Loading weights:  45%|████████████████████████████████████████████████▉                                                           | 340/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.down_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████                                                           | 341/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.gate_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████                                                           | 341/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.gate_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████                                                            | 342/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.up_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████                                                            | 342/751 [01:12<02:32,  2.69it/s, Materializing param=model.layers.21.mlp.shared_experts.up_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████▏                                                            | 343/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▏                                                            | 343/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▎                                                            | 344/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_layernorm.weight]Loading weights:  46%|███████████████████████████████████████████████████▎                                                            | 344/751 [01:12<02:31,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_layernorm.weight]Loading weights:  46%|█████████████████████████████████████████████████▌                                                          | 345/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  46%|█████████████████████████████████████████████████▌                                                          | 345/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  46%|█████████████████████████████████████████████████████▉                                                               | 346/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_b_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████▉                                                               | 346/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.kv_b_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████████▍                                                                | 347/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  46%|███████████████████████████████████████████████████████▍                                                                | 347/751 [01:12<02:30,  2.69it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  46%|████████████████████████████████████████████████████▎                                                            | 348/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_layernorm.weight]Loading weights:  46%|████████████████████████████████████████████████████▎                                                            | 348/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_layernorm.weight]Loading weights:  46%|██████████████████████████████████████████████████████▊                                                               | 349/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_proj.weight]Loading weights:  46%|██████████████████████████████████████████████████████▊                                                               | 349/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_a_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████▉                                                               | 350/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_b_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████▉                                                               | 350/751 [01:12<02:29,  2.69it/s, Materializing param=model.layers.21.self_attn.q_b_proj.weight]Loading weights:  47%|████████████████████████████████████████████████████████▌                                                                | 351/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  47%|████████████████████████████████████████████████████████▌                                                                | 351/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  47%|█████████████████████████████████████████████████████████▏                                                                | 352/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  47%|█████████████████████████████████████████████████████████▏                                                                | 352/751 [01:12<02:28,  2.69it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  47%|█████████████████████████████████████████████████████████▎                                                                | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  47%|█████████████████████████████████████████████████████████▎                                                                | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.down_proj]Loading weights:  47%|███████████████████████████████████████████████████████▉                                                               | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  47%|███████████████████████████████████████████████████████▉                                                               | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  47%|███████████████████████████████████████████████████████▉                                                               | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  47%|███████████████████████████████████████████████████████▉                                                               | 353/751 [01:14<01:35,  4.17it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  47%|████████████████████████████████████████████████████████                                                               | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  47%|████████████████████████████████████████████████████▎                                                          | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.gate.e_score_correction_bias]Loading weights:  47%|████████████████████████████████████████████████████▎                                                          | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.gate.e_score_correction_bias]Loading weights:  47%|████████████████████████████████████████████████████████████▌                                                                   | 355/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.gate.weight]Loading weights:  47%|████████████████████████████████████████████████████████████▌                                                                   | 355/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.gate.weight]Loading weights:  47%|███████████████████████████████████████████████████▏                                                        | 356/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.down_proj.weight]Loading weights:  47%|███████████████████████████████████████████████████▏                                                        | 356/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.down_proj.weight]Loading weights:  47%|████████████████████████████████████████████████████████                                                               | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.experts.gate_up_proj]Loading weights:  48%|███████████████████████████████████████████████████▎                                                        | 357/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.gate_proj.weight]Loading weights:  48%|███████████████████████████████████████████████████▎                                                        | 357/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.gate_proj.weight]Loading weights:  47%|████████████████████████████████████████████████████▎                                                          | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.gate.e_score_correction_bias]Loading weights:  47%|████████████████████████████████████████████████████▎                                                          | 354/751 [01:17<02:29,  2.66it/s, Materializing param=model.layers.22.mlp.gate.e_score_correction_bias]Loading weights:  48%|████████████████████████████████████████████████████▍                                                         | 358/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.up_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████▍                                                         | 358/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.up_proj.weight]Loading weights:  47%|████████████████████████████████████████████████████████████▌                                                                   | 355/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.gate.weight]Loading weights:  48%|█████████████████████████████████████████████████████▌                                                          | 359/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  47%|████████████████████████████████████████████████████████████▌                                                                   | 355/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.gate.weight]Loading weights:  48%|█████████████████████████████████████████████████████▌                                                          | 359/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  47%|███████████████████████████████████████████████████▏                                                        | 356/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.down_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████▋                                                          | 360/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_layernorm.weight]Loading weights:  47%|███████████████████████████████████████████████████▏                                                        | 356/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.down_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████▋                                                          | 360/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_layernorm.weight]Loading weights:  48%|███████████████████████████████████████████████████▎                                                        | 357/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.gate_proj.weight]Loading weights:  48%|███████████████████████████████████████████████████▉                                                        | 361/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  48%|███████████████████████████████████████████████████▎                                                        | 357/751 [01:17<02:28,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.gate_proj.weight]Loading weights:  48%|███████████████████████████████████████████████████▉                                                        | 361/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  48%|████████████████████████████████████████████████████▍                                                         | 358/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.up_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████████▍                                                            | 362/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_b_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████▍                                                         | 358/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.mlp.shared_experts.up_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████████▍                                                            | 362/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_b_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████▌                                                          | 359/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████████                                                              | 363/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████▌                                                          | 359/751 [01:17<02:27,  2.66it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████████                                                              | 363/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████▋                                                          | 360/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████▊                                                          | 364/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_layernorm.weight]Loading weights:  48%|█████████████████████████████████████████████████████▋                                                          | 360/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████▊                                                          | 364/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_layernorm.weight]Loading weights:  48%|███████████████████████████████████████████████████▉                                                        | 361/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▎                                                            | 365/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_proj.weight]Loading weights:  48%|███████████████████████████████████████████████████▉                                                        | 361/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▎                                                            | 365/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████████▍                                                            | 362/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_b_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████████▍                                                            | 362/751 [01:17<02:26,  2.66it/s, Materializing param=model.layers.22.self_attn.kv_b_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▌                                                            | 366/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_b_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▌                                                            | 366/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_b_proj.weight]Loading weights:  48%|██████████████████████████████████████████████████████████                                                              | 363/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  48%|██████████████████████████████████████████████████████████                                                              | 363/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▏                                                             | 367/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▏                                                             | 367/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████▊                                                          | 364/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_layernorm.weight]Loading weights:  48%|██████████████████████████████████████████████████████▊                                                          | 364/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_layernorm.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▊                                                              | 368/751 [01:17<02:23,  2.66it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|███████████████████████████████████████████████████████████▊                                                              | 368/751 [01:17<02:23,  2.66it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|█████████████████████████████████████████████████████████▎                                                            | 365/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▎                                                            | 365/751 [01:17<02:25,  2.66it/s, Materializing param=model.layers.22.self_attn.q_a_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▌                                                            | 366/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_b_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████▌                                                            | 366/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.22.self_attn.q_b_proj.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▏                                                             | 367/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▏                                                             | 367/751 [01:17<02:24,  2.66it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  49%|███████████████████████████████████████████████████████████▊                                                              | 368/751 [01:17<02:23,  2.66it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|███████████████████████████████████████████████████████████▊                                                              | 368/751 [01:17<02:23,  2.66it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|███████████████████████████████████████████████████████████▉                                                              | 369/751 [01:19<01:32,  4.11it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|███████████████████████████████████████████████████████████▉                                                              | 369/751 [01:19<01:32,  4.12it/s, Materializing param=model.layers.23.mlp.experts.down_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▍                                                            | 369/751 [01:19<01:32,  4.11it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▍                                                            | 369/751 [01:19<01:32,  4.12it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▍                                                            | 369/751 [01:19<01:32,  4.11it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▍                                                            | 369/751 [01:19<01:32,  4.12it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▋                                                            | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████▋                                                        | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.gate.e_score_correction_bias]Loading weights:  49%|██████████████████████████████████████████████████████▋                                                        | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.gate.e_score_correction_bias]Loading weights:  49%|███████████████████████████████████████████████████████████████▏                                                                | 371/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.gate.weight]Loading weights:  49%|███████████████████████████████████████████████████████████████▏                                                                | 371/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.gate.weight]Loading weights:  50%|█████████████████████████████████████████████████████▍                                                      | 372/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.down_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▍                                                      | 372/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.down_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▋                                                      | 373/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.gate_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▋                                                      | 373/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.gate_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████▊                                                       | 374/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.up_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████▊                                                       | 374/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.up_proj.weight]Loading weights:  50%|███████████████████████████████████████████████████████▉                                                        | 375/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  50%|███████████████████████████████████████████████████████▉                                                        | 375/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  50%|████████████████████████████████████████████████████████                                                        | 376/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_layernorm.weight]Loading weights:  50%|████████████████████████████████████████████████████████                                                        | 376/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_layernorm.weight]Loading weights:  50%|██████████████████████████████████████████████████████▏                                                     | 377/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  50%|██████████████████████████████████████████████████████▏                                                     | 377/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  50%|██████████████████████████████████████████████████████████▉                                                          | 378/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_b_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████████▉                                                          | 378/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_b_proj.weight]Loading weights:  50%|████████████████████████████████████████████████████████████▌                                                           | 379/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  50%|████████████████████████████████████████████████████████████▌                                                           | 379/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████▏                                                       | 380/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_layernorm.weight]Loading weights:  51%|█████████████████████████████████████████████████████████▏                                                       | 380/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_layernorm.weight]Loading weights:  51%|███████████████████████████████████████████████████████████▊                                                          | 381/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_proj.weight]Loading weights:  51%|███████████████████████████████████████████████████████████▊                                                          | 381/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_proj.weight]Loading weights:  51%|████████████████████████████████████████████████████████████                                                          | 382/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.23.self_attn.q_b_proj.weight]Loading weights:  51%|████████████████████████████████████████████████████████████                                                          | 382/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.23.self_attn.q_b_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████▋                                                           | 383/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████▋                                                           | 383/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  51%|██████████████████████████████████████████████████████████████▍                                                           | 384/751 [01:22<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  51%|██████████████████████████████████████████████████████████████▍                                                           | 384/751 [01:22<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  49%|██████████████████████████████████████████████████████████▋                                                            | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.experts.gate_up_proj]Loading weights:  49%|██████████████████████████████████████████████████████▋                                                        | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.gate.e_score_correction_bias]Loading weights:  49%|██████████████████████████████████████████████████████▋                                                        | 370/751 [01:22<02:26,  2.60it/s, Materializing param=model.layers.23.mlp.gate.e_score_correction_bias]Loading weights:  49%|███████████████████████████████████████████████████████████████▏                                                                | 371/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.gate.weight]Loading weights:  49%|███████████████████████████████████████████████████████████████▏                                                                | 371/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.gate.weight]Loading weights:  50%|█████████████████████████████████████████████████████▍                                                      | 372/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.down_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▍                                                      | 372/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.down_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▋                                                      | 373/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.gate_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████▋                                                      | 373/751 [01:22<02:25,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.gate_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████▊                                                       | 374/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.up_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████▊                                                       | 374/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.mlp.shared_experts.up_proj.weight]Loading weights:  50%|███████████████████████████████████████████████████████▉                                                        | 375/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  50%|███████████████████████████████████████████████████████▉                                                        | 375/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  50%|████████████████████████████████████████████████████████                                                        | 376/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_layernorm.weight]Loading weights:  50%|████████████████████████████████████████████████████████                                                        | 376/751 [01:22<02:24,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_layernorm.weight]Loading weights:  50%|██████████████████████████████████████████████████████▏                                                     | 377/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  50%|██████████████████████████████████████████████████████▏                                                     | 377/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  50%|██████████████████████████████████████████████████████████▉                                                          | 378/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_b_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████████▉                                                          | 378/751 [01:22<02:23,  2.60it/s, Materializing param=model.layers.23.self_attn.kv_b_proj.weight]Loading weights:  50%|████████████████████████████████████████████████████████████▌                                                           | 379/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  50%|████████████████████████████████████████████████████████████▌                                                           | 379/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████▏                                                       | 380/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_layernorm.weight]Loading weights:  51%|█████████████████████████████████████████████████████████▏                                                       | 380/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_layernorm.weight]Loading weights:  51%|███████████████████████████████████████████████████████████▊                                                          | 381/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_proj.weight]Loading weights:  51%|███████████████████████████████████████████████████████████▊                                                          | 381/751 [01:22<02:22,  2.60it/s, Materializing param=model.layers.23.self_attn.q_a_proj.weight]Loading weights:  51%|████████████████████████████████████████████████████████████                                                          | 382/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.23.self_attn.q_b_proj.weight]Loading weights:  51%|████████████████████████████████████████████████████████████                                                          | 382/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.23.self_attn.q_b_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████▋                                                           | 383/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████▋                                                           | 383/751 [01:22<02:21,  2.60it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  51%|██████████████████████████████████████████████████████████████▍                                                           | 384/751 [01:22<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  51%|██████████████████████████████████████████████████████████████▍                                                           | 384/751 [01:22<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  51%|██████████████████████████████████████████████████████████████▌                                                           | 385/751 [01:23<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  51%|██████████████████████████████████████████████████████████████▌                                                           | 385/751 [01:24<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.down_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████                                                          | 385/751 [01:23<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████                                                          | 385/751 [01:24<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████                                                          | 385/751 [01:23<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████                                                          | 385/751 [01:24<01:30,  4.05it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████▏                                                         | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████                                                      | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.e_score_correction_bias]Loading weights:  51%|█████████████████████████████████████████████████████████                                                      | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.e_score_correction_bias]Loading weights:  52%|█████████████████████████████████████████████████████████████████▉                                                              | 387/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.gate.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████████▉                                                              | 387/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.gate.weight]Loading weights:  52%|███████████████████████████████████████████████████████▊                                                    | 388/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.down_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▊                                                    | 388/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.down_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▉                                                    | 389/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.gate_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▉                                                    | 389/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.gate_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████                                                     | 390/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.up_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████                                                     | 390/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.up_proj.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▎                                                     | 391/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▎                                                     | 391/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▍                                                     | 392/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▍                                                     | 392/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_layernorm.weight]Loading weights:  52%|████████████████████████████████████████████████████████▌                                                   | 393/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  52%|████████████████████████████████████████████████████████▌                                                   | 393/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████▍                                                       | 394/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_b_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████▍                                                       | 394/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_b_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████████                                                         | 395/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████████                                                         | 395/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████▌                                                     | 396/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_layernorm.weight]Loading weights:  53%|███████████████████████████████████████████████████████████▌                                                     | 396/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_layernorm.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▍                                                       | 397/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▍                                                       | 397/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▌                                                       | 398/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.24.self_attn.q_b_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▌                                                       | 398/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.24.self_attn.q_b_proj.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▎                                                        | 399/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▎                                                        | 399/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▉                                                         | 400/751 [01:27<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  53%|████████████████████████████████████████████████████████████████▉                                                         | 400/751 [01:27<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  51%|█████████████████████████████████████████████████████████████▏                                                         | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.experts.gate_up_proj]Loading weights:  51%|█████████████████████████████████████████████████████████                                                      | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.e_score_correction_bias]Loading weights:  51%|█████████████████████████████████████████████████████████                                                      | 386/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.e_score_correction_bias]Loading weights:  52%|█████████████████████████████████████████████████████████████████▉                                                              | 387/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████████▉                                                              | 387/751 [01:27<02:20,  2.60it/s, Materializing param=model.layers.24.mlp.gate.weight]Loading weights:  52%|███████████████████████████████████████████████████████▊                                                    | 388/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.down_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▊                                                    | 388/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.down_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▉                                                    | 389/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.gate_proj.weight]Loading weights:  52%|███████████████████████████████████████████████████████▉                                                    | 389/751 [01:27<02:19,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.gate_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████                                                     | 390/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.up_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████                                                     | 390/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.mlp.shared_experts.up_proj.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▎                                                     | 391/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▎                                                     | 391/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▍                                                     | 392/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_layernorm.weight]Loading weights:  52%|██████████████████████████████████████████████████████████▍                                                     | 392/751 [01:27<02:18,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_layernorm.weight]Loading weights:  52%|████████████████████████████████████████████████████████▌                                                   | 393/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  52%|████████████████████████████████████████████████████████▌                                                   | 393/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████▍                                                       | 394/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_b_proj.weight]Loading weights:  52%|█████████████████████████████████████████████████████████████▍                                                       | 394/751 [01:27<02:17,  2.60it/s, Materializing param=model.layers.24.self_attn.kv_b_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████████                                                         | 395/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████████                                                         | 395/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  53%|███████████████████████████████████████████████████████████▌                                                     | 396/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_layernorm.weight]Loading weights:  53%|███████████████████████████████████████████████████████████▌                                                     | 396/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_layernorm.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▍                                                       | 397/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▍                                                       | 397/751 [01:27<02:16,  2.60it/s, Materializing param=model.layers.24.self_attn.q_a_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▌                                                       | 398/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.24.self_attn.q_b_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████▌                                                       | 398/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.24.self_attn.q_b_proj.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▎                                                        | 399/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▎                                                        | 399/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  53%|████████████████████████████████████████████████████████████████▉                                                         | 400/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  53%|████████████████████████████████████████████████████████████████▉                                                         | 400/751 [01:27<02:15,  2.60it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  53%|█████████████████████████████████████████████████████████████████▏                                                        | 401/751 [01:29<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  53%|█████████████████████████████████████████████████████████████████▏                                                        | 401/751 [01:28<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.down_proj]Loading weights:  53%|███████████████████████████████████████████████████████████████▌                                                       | 401/751 [01:29<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  53%|███████████████████████████████████████████████████████████████▌                                                       | 401/751 [01:28<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  53%|███████████████████████████████████████████████████████████████▌                                                       | 401/751 [01:29<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  53%|███████████████████████████████████████████████████████████████▌                                                       | 401/751 [01:28<01:26,  4.05it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  54%|███████████████████████████████████████████████████████████████▋                                                       | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                   | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.gate.e_score_correction_bias]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                   | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.gate.e_score_correction_bias]Loading weights:  54%|████████████████████████████████████████████████████████████████████▋                                                           | 403/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.gate.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▋                                                           | 403/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.gate.weight]Loading weights:  54%|██████████████████████████████████████████████████████████                                                  | 404/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.down_proj.weight]Loading weights:  54%|██████████████████████████████████████████████████████████                                                  | 404/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.down_proj.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▏                                                 | 405/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.gate_proj.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▏                                                 | 405/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.gate_proj.weight]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                  | 406/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.up_proj.weight]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                  | 406/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.up_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▋                                                   | 407/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▋                                                   | 407/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▊                                                   | 408/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▊                                                   | 408/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_layernorm.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▊                                                 | 409/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▊                                                 | 409/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                     | 410/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_b_proj.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                     | 410/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_b_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████▋                                                      | 411/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████▋                                                      | 411/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████▉                                                   | 412/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_layernorm.weight]Loading weights:  54%|███████████████████████████████████████████████████████████████▋                                                       | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.experts.gate_up_proj]Loading weights:  55%|█████████████████████████████████████████████████████████████▉                                                   | 412/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_layernorm.weight]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                   | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.gate.e_score_correction_bias]Loading weights:  55%|████████████████████████████████████████████████████████████████▉                                                     | 413/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_proj.weight]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                   | 402/751 [01:32<02:14,  2.60it/s, Materializing param=model.layers.25.mlp.gate.e_score_correction_bias]Loading weights:  55%|████████████████████████████████████████████████████████████████▉                                                     | 413/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▋                                                           | 403/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.gate.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████                                                     | 414/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.25.self_attn.q_b_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▋                                                           | 403/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.gate.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████                                                     | 414/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.25.self_attn.q_b_proj.weight]Loading weights:  54%|██████████████████████████████████████████████████████████                                                  | 404/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.down_proj.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████▊                                                      | 415/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  54%|██████████████████████████████████████████████████████████                                                  | 404/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.down_proj.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████▊                                                      | 415/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▏                                                 | 405/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.gate_proj.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████████▌                                                      | 416/751 [01:32<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  54%|██████████████████████████████████████████████████████████▏                                                 | 405/751 [01:32<02:13,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.gate_proj.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████████▌                                                      | 416/751 [01:32<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                  | 406/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.up_proj.weight]Loading weights:  54%|███████████████████████████████████████████████████████████▍                                                  | 406/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.mlp.shared_experts.up_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▋                                                   | 407/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▋                                                   | 407/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▊                                                   | 408/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████▊                                                   | 408/751 [01:32<02:12,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_layernorm.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▊                                                 | 409/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  54%|██████████████████████████████████████████████████████████▊                                                 | 409/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                     | 410/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_b_proj.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                     | 410/751 [01:32<02:11,  2.60it/s, Materializing param=model.layers.25.self_attn.kv_b_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████▋                                                      | 411/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████▋                                                      | 411/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████▉                                                   | 412/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_layernorm.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████▉                                                   | 412/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_layernorm.weight]Loading weights:  55%|████████████████████████████████████████████████████████████████▉                                                     | 413/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_proj.weight]Loading weights:  55%|████████████████████████████████████████████████████████████████▉                                                     | 413/751 [01:32<02:10,  2.60it/s, Materializing param=model.layers.25.self_attn.q_a_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████                                                     | 414/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.25.self_attn.q_b_proj.weight]Loading weights:  55%|█████████████████████████████████████████████████████████████████                                                     | 414/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.25.self_attn.q_b_proj.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████▊                                                      | 415/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████▊                                                      | 415/751 [01:32<02:09,  2.60it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████████▌                                                      | 416/751 [01:32<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  55%|███████████████████████████████████████████████████████████████████▌                                                      | 416/751 [01:32<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  56%|███████████████████████████████████████████████████████████████████▋                                                      | 417/751 [01:34<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  56%|███████████████████████████████████████████████████████████████████▋                                                      | 417/751 [01:33<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.down_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████                                                     | 417/751 [01:34<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████                                                     | 417/751 [01:33<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████                                                     | 417/751 [01:34<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████                                                     | 417/751 [01:33<01:22,  4.04it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████▏                                                    | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                 | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.gate.e_score_correction_bias]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                 | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.gate.e_score_correction_bias]Loading weights:  56%|███████████████████████████████████████████████████████████████████████▍                                                        | 419/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.gate.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████████████▍                                                        | 419/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.gate.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▍                                               | 420/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.down_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▍                                               | 420/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.down_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▌                                               | 421/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.gate_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▌                                               | 421/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.gate_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                | 422/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.up_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                | 422/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.up_proj.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████                                                 | 423/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████                                                 | 423/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████▏                                                | 424/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████▏                                                | 424/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_layernorm.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████                                               | 425/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████                                               | 425/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████▎                                                  | 426/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_b_proj.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████▎                                                  | 426/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_b_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████▏                                                   | 427/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████▏                                                   | 427/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████▍                                                | 428/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_layernorm.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████▍                                                | 428/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_layernorm.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▍                                                  | 429/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▍                                                  | 429/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▌                                                  | 430/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_b_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▌                                                  | 430/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_b_proj.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████████████▍                                                   | 431/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████████████▍                                                   | 431/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▏                                                   | 432/751 [01:37<02:02,  2.60it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▏                                                   | 432/751 [01:37<02:02,  2.60it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  56%|██████████████████████████████████████████████████████████████████▏                                                    | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.experts.gate_up_proj]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                 | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.gate.e_score_correction_bias]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                 | 418/751 [01:37<02:08,  2.60it/s, Materializing param=model.layers.26.mlp.gate.e_score_correction_bias]Loading weights:  56%|███████████████████████████████████████████████████████████████████████▍                                                        | 419/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.gate.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████████████▍                                                        | 419/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.gate.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▍                                               | 420/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.down_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▍                                               | 420/751 [01:37<02:07,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.down_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▌                                               | 421/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.gate_proj.weight]Loading weights:  56%|████████████████████████████████████████████████████████████▌                                               | 421/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.gate_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                | 422/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.up_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████▊                                                | 422/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.mlp.shared_experts.up_proj.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████                                                 | 423/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████                                                 | 423/751 [01:37<02:06,  2.60it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████▏                                                | 424/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_layernorm.weight]Loading weights:  56%|███████████████████████████████████████████████████████████████▏                                                | 424/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_layernorm.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████                                               | 425/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████                                               | 425/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████▎                                                  | 426/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_b_proj.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████▎                                                  | 426/751 [01:37<02:05,  2.60it/s, Materializing param=model.layers.26.self_attn.kv_b_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████▏                                                   | 427/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████▏                                                   | 427/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████▍                                                | 428/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_layernorm.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████▍                                                | 428/751 [01:37<02:04,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_layernorm.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▍                                                  | 429/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▍                                                  | 429/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_a_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▌                                                  | 430/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_b_proj.weight]Loading weights:  57%|███████████████████████████████████████████████████████████████████▌                                                  | 430/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.26.self_attn.q_b_proj.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████████████▍                                                   | 431/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  57%|█████████████████████████████████████████████████████████████████████▍                                                   | 431/751 [01:37<02:03,  2.60it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▏                                                   | 432/751 [01:37<02:02,  2.60it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▏                                                   | 432/751 [01:37<02:02,  2.60it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▎                                                   | 433/751 [01:38<01:16,  4.16it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  58%|██████████████████████████████████████████████████████████████████████▎                                                   | 433/751 [01:38<01:16,  4.15it/s, Materializing param=model.layers.27.mlp.experts.down_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▌                                                  | 433/751 [01:38<01:16,  4.16it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▌                                                  | 433/751 [01:38<01:16,  4.15it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▌                                                  | 433/751 [01:38<01:16,  4.16it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▌                                                  | 433/751 [01:38<01:16,  4.15it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▊                                                  | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                              | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.e_score_correction_bias]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                              | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.e_score_correction_bias]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▏                                                     | 435/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▏                                                     | 435/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▋                                             | 436/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.down_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▋                                             | 436/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.down_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▊                                             | 437/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.gate_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▊                                             | 437/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.gate_proj.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                             | 438/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.up_proj.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                             | 438/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.up_proj.weight]Loading weights:  58%|█████████████████████████████████████████████████████████████████▍                                              | 439/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  58%|█████████████████████████████████████████████████████████████████▍                                              | 439/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████▌                                              | 440/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████▌                                              | 440/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_layernorm.weight]Loading weights:  59%|███████████████████████████████████████████████████████████████▍                                            | 441/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  59%|███████████████████████████████████████████████████████████████▍                                            | 441/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████▊                                                | 442/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_b_proj.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████▊                                                | 442/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_b_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████▊                                                 | 443/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████▊                                                 | 443/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████▊                                              | 444/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_layernorm.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████▊                                              | 444/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████▉                                                | 445/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████▉                                                | 445/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████                                                | 446/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_b_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████                                                | 446/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_b_proj.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████                                                 | 447/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████                                                 | 447/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▊                                                 | 448/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▊                                                 | 448/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████████▊                                                  | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.experts.gate_up_proj]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                              | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.e_score_correction_bias]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                              | 434/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.e_score_correction_bias]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▏                                                     | 435/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▏                                                     | 435/751 [01:41<01:51,  2.84it/s, Materializing param=model.layers.27.mlp.gate.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▋                                             | 436/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.down_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▋                                             | 436/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.down_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▊                                             | 437/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.gate_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████▊                                             | 437/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.gate_proj.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                             | 438/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.up_proj.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████▏                                             | 438/751 [01:41<01:50,  2.84it/s, Materializing param=model.layers.27.mlp.shared_experts.up_proj.weight]Loading weights:  58%|█████████████████████████████████████████████████████████████████▍                                              | 439/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  58%|█████████████████████████████████████████████████████████████████▍                                              | 439/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████▌                                              | 440/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████▌                                              | 440/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_layernorm.weight]Loading weights:  59%|███████████████████████████████████████████████████████████████▍                                            | 441/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  59%|███████████████████████████████████████████████████████████████▍                                            | 441/751 [01:41<01:49,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████▊                                                | 442/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_b_proj.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████▊                                                | 442/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.kv_b_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████▊                                                 | 443/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████▊                                                 | 443/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████▊                                              | 444/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_layernorm.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████▊                                              | 444/751 [01:41<01:48,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_layernorm.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████▉                                                | 445/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████▉                                                | 445/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_a_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████                                                | 446/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_b_proj.weight]Loading weights:  59%|██████████████████████████████████████████████████████████████████████                                                | 446/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.27.self_attn.q_b_proj.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████                                                 | 447/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████                                                 | 447/751 [01:41<01:47,  2.84it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▊                                                 | 448/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▊                                                 | 448/751 [01:41<01:46,  2.84it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▉                                                 | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  60%|████████████████████████████████████████████████████████████████████████▉                                                 | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.down_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▏                                               | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▏                                               | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▏                                               | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▏                                               | 449/751 [01:42<01:07,  4.50it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▎                                               | 450/751 [01:44<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|██████████████████████████████████████████████████████████████████▌                                            | 450/751 [01:44<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.e_score_correction_bias]Loading weights:  60%|██████████████████████████████████████████████████████████████████▌                                            | 450/751 [01:44<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.e_score_correction_bias]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                   | 451/751 [01:44<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                   | 451/751 [01:44<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████                                           | 452/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.down_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████                                           | 452/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.down_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████▏                                          | 453/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.gate_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████▏                                          | 453/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.gate_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████▍                                           | 454/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.up_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████▍                                           | 454/751 [01:44<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.up_proj.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████▊                                            | 455/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████▊                                            | 455/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████                                            | 456/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████                                            | 456/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_layernorm.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████▋                                          | 457/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████▋                                          | 457/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████████▎                                             | 458/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_b_proj.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████████▎                                             | 458/751 [01:44<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_b_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████████▎                                              | 459/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████████▎                                              | 459/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████▏                                           | 460/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_layernorm.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████▏                                           | 460/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████████▍                                             | 461/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_proj.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████████▍                                             | 461/751 [01:44<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████▌                                             | 462/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.28.self_attn.q_b_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████▌                                             | 462/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.28.self_attn.q_b_proj.weight]Loading weights:  62%|██████████████████████████████████████████████████████████████████████████▌                                              | 463/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  62%|██████████████████████████████████████████████████████████████████████████▌                                              | 463/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▍                                              | 464/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▍                                              | 464/751 [01:44<01:31,  3.15it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  60%|███████████████████████████████████████████████████████████████████████▎                                               | 450/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.experts.gate_up_proj]Loading weights:  60%|██████████████████████████████████████████████████████████████████▌                                            | 450/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.e_score_correction_bias]Loading weights:  60%|██████████████████████████████████████████████████████████████████▌                                            | 450/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.e_score_correction_bias]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                   | 451/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                   | 451/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.gate.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████                                           | 452/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.down_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████                                           | 452/751 [01:45<01:35,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.down_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████▏                                          | 453/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.gate_proj.weight]Loading weights:  60%|█████████████████████████████████████████████████████████████████▏                                          | 453/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.gate_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████▍                                           | 454/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.up_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████▍                                           | 454/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.mlp.shared_experts.up_proj.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████▊                                            | 455/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████▊                                            | 455/751 [01:45<01:34,  3.15it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████                                            | 456/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████                                            | 456/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_layernorm.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████▋                                          | 457/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████▋                                          | 457/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████████▎                                             | 458/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_b_proj.weight]Loading weights:  61%|███████████████████████████████████████████████████████████████████████▎                                             | 458/751 [01:45<01:33,  3.15it/s, Materializing param=model.layers.28.self_attn.kv_b_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████████▎                                              | 459/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████████▎                                              | 459/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████▏                                           | 460/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_layernorm.weight]Loading weights:  61%|█████████████████████████████████████████████████████████████████████▏                                           | 460/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_layernorm.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████████▍                                             | 461/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_proj.weight]Loading weights:  61%|████████████████████████████████████████████████████████████████████████▍                                             | 461/751 [01:45<01:32,  3.15it/s, Materializing param=model.layers.28.self_attn.q_a_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████▌                                             | 462/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.28.self_attn.q_b_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████▌                                             | 462/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.28.self_attn.q_b_proj.weight]Loading weights:  62%|██████████████████████████████████████████████████████████████████████████▌                                              | 463/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  62%|██████████████████████████████████████████████████████████████████████████▌                                              | 463/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▍                                              | 464/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▍                                              | 464/751 [01:45<01:31,  3.15it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▌                                              | 465/751 [01:45<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████▌                                              | 465/751 [01:46<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.down_proj]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▋                                             | 465/751 [01:45<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▋                                             | 465/751 [01:46<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▋                                             | 465/751 [01:45<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▋                                             | 465/751 [01:46<00:55,  5.18it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▊                                             | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  62%|████████████████████████████████████████████████████████████████████▉                                          | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.e_score_correction_bias]Loading weights:  62%|████████████████████████████████████████████████████████████████████▉                                          | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.e_score_correction_bias]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████████▌                                                | 467/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████████▌                                                | 467/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▎                                        | 468/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.down_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▎                                        | 468/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.down_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▍                                        | 469/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.gate_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▍                                        | 469/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.gate_proj.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████▊                                         | 470/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.up_proj.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████▊                                         | 470/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.up_proj.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▏                                         | 471/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▏                                         | 471/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▍                                         | 472/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▍                                         | 472/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_layernorm.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████                                        | 473/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████                                        | 473/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████▊                                             | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.experts.gate_up_proj]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████▊                                           | 474/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_b_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████▉                                          | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.e_score_correction_bias]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████▊                                           | 474/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_b_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████▉                                          | 466/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.e_score_correction_bias]Loading weights:  63%|███████████████████████████████████████████████████████████████████████████▉                                            | 475/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████████▌                                                | 467/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████████▉                                            | 475/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████████████████▌                                                | 467/751 [01:47<01:14,  3.83it/s, Materializing param=model.layers.29.mlp.gate.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████▌                                         | 476/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_layernorm.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████▌                                         | 476/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_layernorm.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▎                                        | 468/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.down_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▎                                        | 468/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.down_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▉                                           | 477/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▉                                           | 477/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▍                                        | 469/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.gate_proj.weight]Loading weights:  64%|███████████████████████████████████████████████████████████████████████████                                           | 478/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_b_proj.weight]Loading weights:  62%|███████████████████████████████████████████████████████████████████▍                                        | 469/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.gate_proj.weight]Loading weights:  64%|███████████████████████████████████████████████████████████████████████████                                           | 478/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_b_proj.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████▊                                         | 470/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.up_proj.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▏                                           | 479/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████▊                                         | 470/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.mlp.shared_experts.up_proj.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▏                                           | 479/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▏                                         | 471/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▉                                            | 480/751 [01:47<01:10,  3.83it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▏                                         | 471/751 [01:47<01:13,  3.83it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▉                                            | 480/751 [01:47<01:10,  3.83it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▍                                         | 472/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████▍                                         | 472/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_layernorm.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████                                        | 473/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████                                        | 473/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████▊                                           | 474/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_b_proj.weight]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████▊                                           | 474/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.kv_b_proj.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████████▉                                            | 475/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████████▉                                            | 475/751 [01:47<01:12,  3.83it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████▌                                         | 476/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_layernorm.weight]Loading weights:  63%|███████████████████████████████████████████████████████████████████████▌                                         | 476/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_layernorm.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▉                                           | 477/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▉                                           | 477/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_a_proj.weight]Loading weights:  64%|███████████████████████████████████████████████████████████████████████████                                           | 478/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_b_proj.weight]Loading weights:  64%|███████████████████████████████████████████████████████████████████████████                                           | 478/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.29.self_attn.q_b_proj.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▏                                           | 479/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▏                                           | 479/751 [01:47<01:11,  3.83it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▉                                            | 480/751 [01:47<01:10,  3.83it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  64%|█████████████████████████████████████████████████████████████████████████████▉                                            | 480/751 [01:47<01:10,  3.83it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████▏                                           | 481/751 [01:48<00:42,  6.33it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████▏                                           | 481/751 [01:48<00:42,  6.34it/s, Materializing param=model.layers.30.mlp.experts.down_proj]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████▏                                          | 481/751 [01:48<00:42,  6.33it/s, Materializing param=model.layers.30.mlp.experts.gate_up_proj]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████▏                                          | 481/751 [01:48<00:42,  6.34it/s, Materializing param=model.layers.30.mlp.experts.gate_up_proj]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████▏                                          | 481/751 [01:48<00:42,  6.33it/s, Materializing param=model.layers.30.mlp.experts.gate_up_proj]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████▏                                          | 481/751 [01:48<00:42,  6.34it/s, Materializing param=model.layers.30.mlp.experts.gate_up_proj]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▏                                       | 482/751 [01:50<00:42,  6.34it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▏                                       | 482/751 [01:50<00:42,  6.34it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▍                                       | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▎                                             | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▎                                             | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████▌                                      | 484/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.down_proj.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████▌                                      | 484/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.down_proj.weight]Loading weights:  65%|█████████████████████████████████████████████████████████████████████▋                                      | 485/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.gate_proj.weight]Loading weights:  65%|█████████████████████████████████████████████████████████████████████▋                                      | 485/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.gate_proj.weight]Loading weights:  65%|███████████████████████████████████████████████████████████████████████▏                                      | 486/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.up_proj.weight]Loading weights:  65%|███████████████████████████████████████████████████████████████████████▏                                      | 486/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.up_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▋                                       | 487/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▋                                       | 487/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▊                                       | 488/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▊                                       | 488/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_layernorm.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 489/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 489/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████▎                                        | 490/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_b_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████▎                                        | 490/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_b_proj.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████████████▍                                         | 491/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████████████▍                                         | 491/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████                                       | 492/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_layernorm.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████                                       | 492/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_layernorm.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▍                                        | 493/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▍                                        | 493/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▌                                        | 494/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.30.self_attn.q_b_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▌                                        | 494/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.30.self_attn.q_b_proj.weight]Loading weights:  66%|███████████████████████████████████████████████████████████████████████████████▊                                         | 495/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  66%|███████████████████████████████████████████████████████████████████████████████▊                                         | 495/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▌                                         | 496/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▌                                         | 496/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▏                                       | 482/751 [01:50<00:42,  6.33it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▏                                       | 482/751 [01:50<00:42,  6.33it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|███████████████████████████████████████████████████████████████████████▍                                       | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.e_score_correction_bias]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▎                                             | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▎                                             | 483/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.gate.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████▌                                      | 484/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.down_proj.weight]Loading weights:  64%|█████████████████████████████████████████████████████████████████████▌                                      | 484/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.down_proj.weight]Loading weights:  65%|█████████████████████████████████████████████████████████████████████▋                                      | 485/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.gate_proj.weight]Loading weights:  65%|█████████████████████████████████████████████████████████████████████▋                                      | 485/751 [01:50<01:01,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.gate_proj.weight]Loading weights:  65%|███████████████████████████████████████████████████████████████████████▏                                      | 486/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.up_proj.weight]Loading weights:  65%|███████████████████████████████████████████████████████████████████████▏                                      | 486/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.mlp.shared_experts.up_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▋                                       | 487/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▋                                       | 487/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▊                                       | 488/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████▊                                       | 488/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_layernorm.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 489/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 489/751 [01:50<01:00,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████▎                                        | 490/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_b_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████▎                                        | 490/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.kv_b_proj.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████████████▍                                         | 491/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  65%|██████████████████████████████████████████████████████████████████████████████▍                                         | 491/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████                                       | 492/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_layernorm.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████                                       | 492/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_layernorm.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▍                                        | 493/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▍                                        | 493/751 [01:50<00:59,  4.36it/s, Materializing param=model.layers.30.self_attn.q_a_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▌                                        | 494/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.30.self_attn.q_b_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████▌                                        | 494/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.30.self_attn.q_b_proj.weight]Loading weights:  66%|███████████████████████████████████████████████████████████████████████████████▊                                         | 495/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  66%|███████████████████████████████████████████████████████████████████████████████▊                                         | 495/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▌                                         | 496/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▌                                         | 496/751 [01:50<00:58,  4.36it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▋                                         | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████▋                                         | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.down_proj]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████▊                                        | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.gate_up_proj]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████▊                                        | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.gate_up_proj]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████▊                                        | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.gate_up_proj]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████▊                                        | 497/751 [01:51<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.experts.gate_up_proj]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▌                                     | 498/751 [01:52<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▌                                     | 498/751 [01:52<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▊                                     | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████████                                           | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████████                                           | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 500/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.down_proj.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 500/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.down_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████                                    | 501/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.gate_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▌                                     | 498/751 [01:52<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  67%|████████████████████████████████████████████████████████████████████████                                    | 501/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.gate_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▌                                     | 498/751 [01:52<00:38,  6.65it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  67%|█████████████████████████████████████████████████████████████████████████▌                                    | 502/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.up_proj.weight]Loading weights:  67%|█████████████████████████████████████████████████████████████████████████▌                                    | 502/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.up_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████▊                                     | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.e_score_correction_bias]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████                                     | 503/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████████                                           | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████                                     | 503/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████████                                           | 499/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.gate.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████▏                                    | 504/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_layernorm.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 500/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.down_proj.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████▏                                    | 504/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_layernorm.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 500/751 [01:52<00:48,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.down_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████                                    | 501/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.gate_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████▌                                   | 505/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████                                    | 501/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.gate_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████▌                                   | 505/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  67%|█████████████████████████████████████████████████████████████████████████▌                                    | 502/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.up_proj.weight]Loading weights:  67%|█████████████████████████████████████████████████████████████████████████▌                                    | 502/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.mlp.shared_experts.up_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▊                                      | 506/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_b_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▊                                      | 506/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_b_proj.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████                                     | 503/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████                                     | 503/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████                                       | 507/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████                                       | 507/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████▏                                    | 504/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_layernorm.weight]Loading weights:  67%|███████████████████████████████████████████████████████████████████████████▏                                    | 504/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_layernorm.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████▍                                    | 508/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_layernorm.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████▍                                    | 508/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_layernorm.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████▌                                   | 505/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████▌                                   | 505/751 [01:52<00:47,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 509/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_proj.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 509/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▊                                      | 506/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_b_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▊                                      | 506/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.kv_b_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████▏                                     | 510/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_b_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████▏                                     | 510/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_b_proj.weight]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████                                       | 507/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████                                       | 507/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  68%|██████████████████████████████████████████████████████████████████████████████████▎                                      | 511/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.input_layernorm.weight]Loading weights:  68%|██████████████████████████████████████████████████████████████████████████████████▎                                      | 511/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.input_layernorm.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████▍                                    | 508/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_layernorm.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████▍                                    | 508/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_layernorm.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▏                                      | 512/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 509/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_proj.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▏                                      | 512/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████▉                                      | 509/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_a_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████▏                                     | 510/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_b_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████▏                                     | 510/751 [01:52<00:46,  5.22it/s, Materializing param=model.layers.31.self_attn.q_b_proj.weight]Loading weights:  68%|██████████████████████████████████████████████████████████████████████████████████▎                                      | 511/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.input_layernorm.weight]Loading weights:  68%|██████████████████████████████████████████████████████████████████████████████████▎                                      | 511/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.input_layernorm.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▏                                      | 512/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▏                                      | 512/751 [01:52<00:45,  5.22it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▎                                      | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▎                                      | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.down_proj]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████▎                                     | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.gate_up_proj]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████▎                                     | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.gate_up_proj]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████▎                                     | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.gate_up_proj]Loading weights:  68%|█████████████████████████████████████████████████████████████████████████████████▎                                     | 513/751 [01:53<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.experts.gate_up_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████▉                                   | 514/751 [01:55<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████▉                                   | 514/751 [01:55<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  69%|████████████████████████████████████████████████████████████████████████████                                   | 515/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████████████████▊                                        | 515/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.gate.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████████████████▊                                        | 515/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.gate.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▏                                 | 516/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.down_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▏                                 | 516/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.down_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▎                                 | 517/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.gate_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▎                                 | 517/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.gate_proj.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████▊                                  | 518/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.up_proj.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████▊                                  | 518/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.mlp.shared_experts.up_proj.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 519/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.post_attention_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 519/751 [01:55<00:44,  5.26it/s, Materializing param=model.layers.32.post_attention_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 520/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_a_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 520/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_a_layernorm.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▉                                 | 521/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▉                                 | 521/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▎                                   | 522/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_b_proj.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▎                                   | 522/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.kv_b_proj.weight]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▌                                    | 523/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.o_proj.weight]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▌                                    | 523/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.o_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████▊                                  | 524/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.q_a_layernorm.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████▊                                  | 524/751 [01:55<00:43,  5.26it/s, Materializing param=model.layers.32.self_attn.q_a_layernorm.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▍                                   | 525/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.32.self_attn.q_a_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▍                                   | 525/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.32.self_attn.q_a_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▋                                   | 526/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.32.self_attn.q_b_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▋                                   | 526/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.32.self_attn.q_b_proj.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████▉                                    | 527/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.33.input_layernorm.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████▉                                    | 527/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.33.input_layernorm.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▊                                    | 528/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▊                                    | 528/751 [01:55<00:42,  5.26it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████▉                                   | 514/751 [01:54<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████▉                                   | 514/751 [01:54<00:30,  7.80it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  69%|████████████████████████████████████████████████████████████████████████████                                   | 515/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.gate.e_score_correction_bias]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████████████████▊                                        | 515/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.gate.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████████████████▊                                        | 515/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.gate.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▏                                 | 516/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.down_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▏                                 | 516/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.down_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▎                                 | 517/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.gate_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▎                                 | 517/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.gate_proj.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████▊                                  | 518/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.up_proj.weight]Loading weights:  69%|███████████████████████████████████████████████████████████████████████████▊                                  | 518/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.mlp.shared_experts.up_proj.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 519/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.post_attention_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▍                                  | 519/751 [01:54<00:44,  5.25it/s, Materializing param=model.layers.32.post_attention_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 520/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_a_layernorm.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 520/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_a_layernorm.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▉                                 | 521/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████▉                                 | 521/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▎                                   | 522/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_b_proj.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▎                                   | 522/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.kv_b_proj.weight]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▌                                    | 523/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.o_proj.weight]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▌                                    | 523/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.o_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████▊                                  | 524/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.q_a_layernorm.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████▊                                  | 524/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.q_a_layernorm.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▍                                   | 525/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.q_a_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▍                                   | 525/751 [01:54<00:43,  5.25it/s, Materializing param=model.layers.32.self_attn.q_a_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▋                                   | 526/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.32.self_attn.q_b_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████▋                                   | 526/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.32.self_attn.q_b_proj.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████▉                                    | 527/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.33.input_layernorm.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████▉                                    | 527/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.33.input_layernorm.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▊                                    | 528/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▊                                    | 528/751 [01:54<00:42,  5.25it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▉                                    | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████████▉                                    | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.down_proj]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▊                                   | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.gate_up_proj]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▊                                   | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.gate_up_proj]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▊                                   | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.gate_up_proj]Loading weights:  70%|███████████████████████████████████████████████████████████████████████████████████▊                                   | 529/751 [01:55<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.experts.gate_up_proj]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▎                                | 530/751 [01:57<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▎                                | 530/751 [01:57<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▍                                | 531/751 [01:57<00:38,  5.77it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████████████████▌                                     | 531/751 [01:57<00:38,  5.77it/s, Materializing param=model.layers.33.mlp.gate.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████████████████▌                                     | 531/751 [01:57<00:38,  5.77it/s, Materializing param=model.layers.33.mlp.gate.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▌                               | 532/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.down_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▌                               | 532/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.down_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▋                               | 533/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.gate_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▋                               | 533/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.gate_proj.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▏                               | 534/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.up_proj.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▏                               | 534/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.mlp.shared_experts.up_proj.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▊                                | 535/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.post_attention_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▊                                | 535/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.post_attention_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▉                                | 536/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_a_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▉                                | 536/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████▏                              | 537/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████▏                              | 537/751 [01:57<00:37,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████▊                                 | 538/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_b_proj.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████▊                                 | 538/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.kv_b_proj.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████▏                                 | 539/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.o_proj.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████▏                                 | 539/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.o_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████▎                               | 540/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████▎                               | 540/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████                                 | 541/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_a_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████                                 | 541/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_a_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 542/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_b_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 542/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.33.self_attn.q_b_proj.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████████▍                                 | 543/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.34.input_layernorm.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████████▍                                 | 543/751 [01:57<00:36,  5.77it/s, Materializing param=model.layers.34.input_layernorm.weight]Loading weights:  72%|████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/751 [01:57<00:35,  5.77it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  72%|████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/751 [01:57<00:35,  5.77it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▎                                | 530/751 [01:56<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▎                                | 530/751 [01:56<00:29,  7.56it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▍                                | 531/751 [01:56<00:38,  5.76it/s, Materializing param=model.layers.33.mlp.gate.e_score_correction_bias]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████████████████▌                                     | 531/751 [01:56<00:38,  5.76it/s, Materializing param=model.layers.33.mlp.gate.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████████████████▌                                     | 531/751 [01:56<00:38,  5.76it/s, Materializing param=model.layers.33.mlp.gate.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▌                               | 532/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.down_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▌                               | 532/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.down_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▋                               | 533/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.gate_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████▋                               | 533/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.gate_proj.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▏                               | 534/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.up_proj.weight]Loading weights:  71%|██████████████████████████████████████████████████████████████████████████████▏                               | 534/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.mlp.shared_experts.up_proj.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▊                                | 535/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.post_attention_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▊                                | 535/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.post_attention_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▉                                | 536/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_a_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████▉                                | 536/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████▏                              | 537/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████▏                              | 537/751 [01:56<00:37,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████▊                                 | 538/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_b_proj.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████▊                                 | 538/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.kv_b_proj.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████▏                                 | 539/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.o_proj.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████▏                                 | 539/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.o_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████▎                               | 540/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████▎                               | 540/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_a_layernorm.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████                                 | 541/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_a_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████                                 | 541/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_a_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 542/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_b_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████▏                                | 542/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.33.self_attn.q_b_proj.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████████▍                                 | 543/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.34.input_layernorm.weight]Loading weights:  72%|███████████████████████████████████████████████████████████████████████████████████████▍                                 | 543/751 [01:56<00:36,  5.76it/s, Materializing param=model.layers.34.input_layernorm.weight]Loading weights:  72%|████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/751 [01:56<00:35,  5.76it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  72%|████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/751 [01:56<00:35,  5.76it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████▌                                 | 545/751 [01:57<00:23,  8.82it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████▌                                 | 545/751 [01:57<00:23,  8.81it/s, Materializing param=model.layers.34.mlp.experts.down_proj]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████▎                                | 545/751 [01:57<00:23,  8.82it/s, Materializing param=model.layers.34.mlp.experts.gate_up_proj]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████▎                                | 545/751 [01:57<00:23,  8.81it/s, Materializing param=model.layers.34.mlp.experts.gate_up_proj]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████▎                                | 545/751 [01:57<00:23,  8.82it/s, Materializing param=model.layers.34.mlp.experts.gate_up_proj]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████▎                                | 545/751 [01:57<00:23,  8.81it/s, Materializing param=model.layers.34.mlp.experts.gate_up_proj]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▋                              | 546/751 [01:58<00:23,  8.82it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▋                              | 546/751 [01:58<00:23,  8.82it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▊                              | 547/751 [01:58<00:33,  6.17it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 547/751 [01:58<00:33,  6.17it/s, Materializing param=model.layers.34.mlp.gate.weight]Loading weights:  73%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 547/751 [01:58<00:33,  6.17it/s, Materializing param=model.layers.34.mlp.gate.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▊                             | 548/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.down_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▊                             | 548/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.down_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▉                             | 549/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.gate_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▉                             | 549/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.gate_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▌                             | 550/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.up_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▌                             | 550/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.mlp.shared_experts.up_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████▏                             | 551/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.post_attention_layernorm.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████▏                             | 551/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.post_attention_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████▎                             | 552/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_a_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████▎                             | 552/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████▌                            | 553/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████▌                            | 553/751 [01:58<00:32,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████▎                              | 554/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_b_proj.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████▎                              | 554/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.kv_b_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████▋                               | 555/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.o_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████▋                               | 555/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.o_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████▋                             | 556/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████▋                             | 556/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                              | 557/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_a_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                              | 557/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_a_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▋                              | 558/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_b_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▋                              | 558/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.34.self_attn.q_b_proj.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████████                               | 559/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.35.input_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████████                               | 559/751 [01:58<00:31,  6.17it/s, Materializing param=model.layers.35.input_layernorm.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████▉                               | 560/751 [01:58<00:30,  6.17it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████▉                               | 560/751 [01:58<00:30,  6.17it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▋                              | 546/751 [02:00<00:23,  8.81it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▋                              | 546/751 [02:00<00:23,  8.81it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▊                              | 547/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.gate.e_score_correction_bias]Loading weights:  73%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 547/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.gate.weight]Loading weights:  73%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 547/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.gate.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▊                             | 548/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.down_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▊                             | 548/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.down_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▉                             | 549/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.gate_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████▉                             | 549/751 [02:00<00:40,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.gate_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▌                             | 550/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.up_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████▌                             | 550/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.mlp.shared_experts.up_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████▏                             | 551/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.post_attention_layernorm.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████▏                             | 551/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.post_attention_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████▎                             | 552/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_a_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████▎                             | 552/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████▌                            | 553/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████▌                            | 553/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████▎                              | 554/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_b_proj.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████▎                              | 554/751 [02:00<00:39,  5.05it/s, Materializing param=model.layers.34.self_attn.kv_b_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████▋                               | 555/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.o_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████▋                               | 555/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.o_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████▋                             | 556/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████▋                             | 556/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_a_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                              | 557/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_a_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                              | 557/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_a_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▋                              | 558/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_b_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████▋                              | 558/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.34.self_attn.q_b_proj.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████████                               | 559/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.35.input_layernorm.weight]Loading weights:  74%|██████████████████████████████████████████████████████████████████████████████████████████                               | 559/751 [02:00<00:38,  5.05it/s, Materializing param=model.layers.35.input_layernorm.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████▉                               | 560/751 [02:00<00:37,  5.05it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████▉                               | 560/751 [02:00<00:37,  5.05it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████▏                              | 561/751 [02:00<00:25,  7.40it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████▏                              | 561/751 [02:00<00:24,  7.88it/s, Materializing param=model.layers.35.mlp.experts.down_proj]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████▉                              | 561/751 [02:00<00:25,  7.40it/s, Materializing param=model.layers.35.mlp.experts.gate_up_proj]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████▉                              | 561/751 [02:00<00:24,  7.88it/s, Materializing param=model.layers.35.mlp.experts.gate_up_proj]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████▉                              | 561/751 [02:00<00:25,  7.40it/s, Materializing param=model.layers.35.mlp.experts.gate_up_proj]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████▉                              | 561/751 [02:00<00:24,  7.88it/s, Materializing param=model.layers.35.mlp.experts.gate_up_proj]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████                            | 562/751 [02:03<00:23,  7.88it/s, Materializing param=model.layers.35.mlp.gate.e_score_correction_bias]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████                            | 562/751 [02:03<00:23,  7.88it/s, Materializing param=model.layers.35.mlp.gate.e_score_correction_bias]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████▏                           | 563/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.gate.e_score_correction_bias]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                | 563/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.gate.weight]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                | 563/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.gate.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████                           | 564/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.down_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████                           | 564/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.down_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 565/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.gate_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 565/751 [02:03<00:46,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.gate_proj.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████▉                           | 566/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.up_proj.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████▉                           | 566/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.mlp.shared_experts.up_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████▌                           | 567/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.post_attention_layernorm.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████▌                           | 567/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.post_attention_layernorm.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 568/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_a_layernorm.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 568/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_a_layernorm.weight]Loading weights:  76%|█████████████████████████████████████████████████████████████████████████████████▊                          | 569/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  76%|█████████████████████████████████████████████████████████████████████████████████▊                          | 569/751 [02:03<00:45,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████▊                            | 570/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_b_proj.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████▊                            | 570/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.kv_b_proj.weight]Loading weights:  76%|███████████████████████████████████████████████████████████████████████████████████████████▏                            | 571/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.o_proj.weight]Loading weights:  76%|███████████████████████████████████████████████████████████████████████████████████████████▏                            | 571/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.o_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████                           | 572/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.q_a_layernorm.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████                           | 572/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.q_a_layernorm.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████                            | 573/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.q_a_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████                            | 573/751 [02:03<00:44,  4.03it/s, Materializing param=model.layers.35.self_attn.q_a_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████▏                           | 574/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.35.self_attn.q_b_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████▏                           | 574/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.35.self_attn.q_b_proj.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████████▋                            | 575/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.36.input_layernorm.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████████▋                            | 575/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.36.input_layernorm.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▌                            | 576/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▌                            | 576/751 [02:03<00:43,  4.03it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████████████                              | 562/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.experts.gate_up_proj]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████                            | 562/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.gate.e_score_correction_bias]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████                            | 562/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.gate.e_score_correction_bias]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                | 563/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.gate.weight]Loading weights:  75%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                | 563/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.gate.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████                           | 564/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.down_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████                           | 564/751 [02:03<00:50,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.down_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 565/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.gate_proj.weight]Loading weights:  75%|█████████████████████████████████████████████████████████████████████████████████▎                          | 565/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.gate_proj.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████▉                           | 566/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.up_proj.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████▉                           | 566/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.mlp.shared_experts.up_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████▌                           | 567/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.post_attention_layernorm.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████▌                           | 567/751 [02:03<00:49,  3.74it/s, Materializing param=model.layers.35.post_attention_layernorm.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 568/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_a_layernorm.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████▋                           | 568/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_a_layernorm.weight]Loading weights:  76%|█████████████████████████████████████████████████████████████████████████████████▊                          | 569/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  76%|█████████████████████████████████████████████████████████████████████████████████▊                          | 569/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████▊                            | 570/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_b_proj.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████▊                            | 570/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.kv_b_proj.weight]Loading weights:  76%|███████████████████████████████████████████████████████████████████████████████████████████▏                            | 571/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.o_proj.weight]Loading weights:  76%|███████████████████████████████████████████████████████████████████████████████████████████▏                            | 571/751 [02:03<00:48,  3.74it/s, Materializing param=model.layers.35.self_attn.o_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████                           | 572/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_a_layernorm.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████                           | 572/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_a_layernorm.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████                            | 573/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_a_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████                            | 573/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_a_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████▏                           | 574/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_b_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████▏                           | 574/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.35.self_attn.q_b_proj.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████████▋                            | 575/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.36.input_layernorm.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████████▋                            | 575/751 [02:03<00:47,  3.74it/s, Materializing param=model.layers.36.input_layernorm.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▌                            | 576/751 [02:03<00:46,  3.74it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▌                            | 576/751 [02:03<00:46,  3.74it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▋                            | 577/751 [02:04<00:31,  5.46it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▋                            | 577/751 [02:05<00:31,  5.52it/s, Materializing param=model.layers.36.mlp.experts.down_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▍                           | 577/751 [02:04<00:31,  5.46it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▍                           | 577/751 [02:05<00:31,  5.52it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▍                           | 577/751 [02:04<00:31,  5.46it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▍                           | 577/751 [02:05<00:31,  5.52it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▌                           | 578/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▍                         | 578/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.gate.e_score_correction_bias]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▍                         | 578/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.gate.e_score_correction_bias]Loading weights:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 579/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.gate.weight]Loading weights:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 579/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.gate.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 580/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.down_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 580/751 [02:08<00:54,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.down_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▌                        | 581/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.gate_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▌                        | 581/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.gate_proj.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▏                        | 582/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.up_proj.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▏                        | 582/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.mlp.shared_experts.up_proj.weight]Loading weights:  78%|██████████████████████████████████████████████████████████████████████████████████████▉                         | 583/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.post_attention_layernorm.weight]Loading weights:  78%|██████████████████████████████████████████████████████████████████████████████████████▉                         | 583/751 [02:08<00:53,  3.16it/s, Materializing param=model.layers.36.post_attention_layernorm.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████                         | 584/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_a_layernorm.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████                         | 584/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 585/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 585/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████████▎                         | 586/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_b_proj.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████████▎                         | 586/751 [02:08<00:52,  3.16it/s, Materializing param=model.layers.36.self_attn.kv_b_proj.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████▊                          | 587/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.o_proj.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████▊                          | 587/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.o_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████▍                        | 588/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████▍                        | 588/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████▌                         | 589/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_a_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████▌                         | 589/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_a_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████▋                         | 590/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_b_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████▋                         | 590/751 [02:08<00:51,  3.16it/s, Materializing param=model.layers.36.self_attn.q_b_proj.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████▏                         | 591/751 [02:08<00:50,  3.16it/s, Materializing param=model.layers.37.input_layernorm.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████▏                         | 591/751 [02:08<00:50,  3.16it/s, Materializing param=model.layers.37.input_layernorm.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 592/751 [02:08<00:50,  3.16it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 592/751 [02:08<00:50,  3.16it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████▌                           | 578/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.experts.gate_up_proj]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▍                         | 578/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.gate.e_score_correction_bias]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▍                         | 578/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.gate.e_score_correction_bias]Loading weights:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 579/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.gate.weight]Loading weights:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 579/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.gate.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 580/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.down_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▍                        | 580/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.down_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▌                        | 581/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.gate_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████▌                        | 581/751 [02:08<00:54,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.gate_proj.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▏                        | 582/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.up_proj.weight]Loading weights:  77%|█████████████████████████████████████████████████████████████████████████████████████▏                        | 582/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.mlp.shared_experts.up_proj.weight]Loading weights:  78%|██████████████████████████████████████████████████████████████████████████████████████▉                         | 583/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.post_attention_layernorm.weight]Loading weights:  78%|██████████████████████████████████████████████████████████████████████████████████████▉                         | 583/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.post_attention_layernorm.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████                         | 584/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_a_layernorm.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████                         | 584/751 [02:08<00:53,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 585/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 585/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████████▎                         | 586/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_b_proj.weight]Loading weights:  78%|███████████████████████████████████████████████████████████████████████████████████████████▎                         | 586/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.kv_b_proj.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████▊                          | 587/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.o_proj.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████▊                          | 587/751 [02:08<00:52,  3.15it/s, Materializing param=model.layers.36.self_attn.o_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████▍                        | 588/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████▍                        | 588/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_a_layernorm.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████▌                         | 589/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_a_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████▌                         | 589/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_a_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████▋                         | 590/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_b_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████▋                         | 590/751 [02:08<00:51,  3.15it/s, Materializing param=model.layers.36.self_attn.q_b_proj.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████▏                         | 591/751 [02:08<00:50,  3.15it/s, Materializing param=model.layers.37.input_layernorm.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████▏                         | 591/751 [02:08<00:50,  3.15it/s, Materializing param=model.layers.37.input_layernorm.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 592/751 [02:08<00:50,  3.15it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 592/751 [02:08<00:50,  3.15it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 593/751 [02:09<00:33,  4.73it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 593/751 [02:09<00:33,  4.75it/s, Materializing param=model.layers.37.mlp.experts.down_proj]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████▉                         | 593/751 [02:09<00:33,  4.73it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████▉                         | 593/751 [02:09<00:33,  4.75it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████▉                         | 593/751 [02:09<00:33,  4.73it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████▉                         | 593/751 [02:09<00:33,  4.75it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|██████████████████████████████████████████████████████████████████████████████████████████████                         | 594/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████▊                       | 594/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.gate.e_score_correction_bias]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████▊                       | 594/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.gate.e_score_correction_bias]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 595/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.gate.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 595/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.gate.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 596/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.down_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 596/751 [02:12<00:53,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.down_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 597/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.gate_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 597/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.gate_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████▌                      | 598/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.up_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████▌                      | 598/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.mlp.shared_experts.up_proj.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▎                      | 599/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.post_attention_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▎                      | 599/751 [02:12<00:52,  2.92it/s, Materializing param=model.layers.37.post_attention_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▍                      | 600/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_a_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▍                      | 600/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_a_layernorm.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 601/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 601/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████████▊                       | 602/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_b_proj.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████████▊                       | 602/751 [02:12<00:51,  2.92it/s, Materializing param=model.layers.37.self_attn.kv_b_proj.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 603/751 [02:12<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.o_proj.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 603/751 [02:12<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.o_proj.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████████▉                      | 604/751 [02:12<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.q_a_layernorm.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████████▉                      | 604/751 [02:13<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.q_a_layernorm.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████                       | 605/751 [02:13<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.q_a_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████                       | 605/751 [02:13<00:50,  2.92it/s, Materializing param=model.layers.37.self_attn.q_a_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████▏                      | 606/751 [02:13<00:49,  2.92it/s, Materializing param=model.layers.37.self_attn.q_b_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████▏                      | 606/751 [02:13<00:49,  2.92it/s, Materializing param=model.layers.37.self_attn.q_b_proj.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 607/751 [02:13<00:49,  2.92it/s, Materializing param=model.layers.38.input_layernorm.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 607/751 [02:13<00:49,  2.92it/s, Materializing param=model.layers.38.input_layernorm.weight]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 608/751 [02:13<00:48,  2.92it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 608/751 [02:13<00:48,  2.92it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  79%|██████████████████████████████████████████████████████████████████████████████████████████████                         | 594/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.experts.gate_up_proj]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████▊                       | 594/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.gate.e_score_correction_bias]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████▊                       | 594/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.gate.e_score_correction_bias]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 595/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.gate.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 595/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.gate.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 596/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.down_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▋                      | 596/751 [02:12<00:53,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.down_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 597/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.gate_proj.weight]Loading weights:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 597/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.gate_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████▌                      | 598/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.up_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████▌                      | 598/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.mlp.shared_experts.up_proj.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▎                      | 599/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.post_attention_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▎                      | 599/751 [02:12<00:52,  2.91it/s, Materializing param=model.layers.37.post_attention_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▍                      | 600/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_a_layernorm.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████▍                      | 600/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_a_layernorm.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 601/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 601/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████████▊                       | 602/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_b_proj.weight]Loading weights:  80%|█████████████████████████████████████████████████████████████████████████████████████████████▊                       | 602/751 [02:12<00:51,  2.91it/s, Materializing param=model.layers.37.self_attn.kv_b_proj.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 603/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.o_proj.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 603/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.o_proj.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████████▉                      | 604/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.q_a_layernorm.weight]Loading weights:  80%|██████████████████████████████████████████████████████████████████████████████████████████▉                      | 604/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.q_a_layernorm.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████                       | 605/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.q_a_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████                       | 605/751 [02:12<00:50,  2.91it/s, Materializing param=model.layers.37.self_attn.q_a_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████▏                      | 606/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.37.self_attn.q_b_proj.weight]Loading weights:  81%|███████████████████████████████████████████████████████████████████████████████████████████████▏                      | 606/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.37.self_attn.q_b_proj.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 607/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.38.input_layernorm.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 607/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.38.input_layernorm.weight]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 608/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 608/751 [02:12<00:49,  2.91it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 609/751 [02:14<00:31,  4.44it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 609/751 [02:14<00:32,  4.43it/s, Materializing param=model.layers.38.mlp.experts.down_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 609/751 [02:14<00:32,  4.43it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 609/751 [02:14<00:31,  4.44it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 609/751 [02:14<00:32,  4.43it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 609/751 [02:14<00:31,  4.44it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 610/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 610/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.gate.e_score_correction_bias]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 610/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.gate.e_score_correction_bias]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 611/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.gate.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 611/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.gate.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████                    | 612/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.down_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████                    | 612/751 [02:17<00:50,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.down_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 613/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.gate_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 613/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.gate_proj.weight]Loading weights:  82%|█████████████████████████████████████████████████████████████████████████████████████████▉                    | 614/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.up_proj.weight]Loading weights:  82%|█████████████████████████████████████████████████████████████████████████████████████████▉                    | 614/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.mlp.shared_experts.up_proj.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▋                    | 615/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.post_attention_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▋                    | 615/751 [02:17<00:49,  2.77it/s, Materializing param=model.layers.38.post_attention_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▊                    | 616/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_a_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▊                    | 616/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_a_layernorm.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▋                   | 617/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▋                   | 617/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 618/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_b_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 618/751 [02:17<00:48,  2.77it/s, Materializing param=model.layers.38.self_attn.kv_b_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 619/751 [02:17<00:47,  2.77it/s, Materializing param=model.layers.38.self_attn.o_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 619/751 [02:17<00:47,  2.77it/s, Materializing param=model.layers.38.self_attn.o_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                   | 620/751 [02:17<00:47,  2.77it/s, Materializing param=model.layers.38.self_attn.q_a_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                   | 620/751 [02:17<00:47,  2.77it/s, Materializing param=model.layers.38.self_attn.q_a_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 621/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.38.self_attn.q_a_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 621/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.38.self_attn.q_a_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 622/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.38.self_attn.q_b_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 622/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.38.self_attn.q_b_proj.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 623/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.39.input_layernorm.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 623/751 [02:17<00:46,  2.77it/s, Materializing param=model.layers.39.input_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 624/751 [02:17<00:45,  2.77it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 624/751 [02:17<00:45,  2.77it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 610/751 [02:17<00:51,  2.76it/s, Materializing param=model.layers.38.mlp.experts.gate_up_proj]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 610/751 [02:17<00:51,  2.76it/s, Materializing param=model.layers.38.mlp.gate.e_score_correction_bias]Loading weights:  81%|██████████████████████████████████████████████████████████████████████████████████████████▏                    | 610/751 [02:17<00:51,  2.76it/s, Materializing param=model.layers.38.mlp.gate.e_score_correction_bias]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 611/751 [02:17<00:50,  2.76it/s, Materializing param=model.layers.38.mlp.gate.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 611/751 [02:17<00:50,  2.76it/s, Materializing param=model.layers.38.mlp.gate.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████                    | 612/751 [02:17<00:50,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.down_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████                    | 612/751 [02:17<00:50,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.down_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 613/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.gate_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▏                   | 613/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.gate_proj.weight]Loading weights:  82%|█████████████████████████████████████████████████████████████████████████████████████████▉                    | 614/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.up_proj.weight]Loading weights:  82%|█████████████████████████████████████████████████████████████████████████████████████████▉                    | 614/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.mlp.shared_experts.up_proj.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▋                    | 615/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.post_attention_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▋                    | 615/751 [02:17<00:49,  2.76it/s, Materializing param=model.layers.38.post_attention_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▊                    | 616/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_a_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████▊                    | 616/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_a_layernorm.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▋                   | 617/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████▋                   | 617/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 618/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_b_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 618/751 [02:17<00:48,  2.76it/s, Materializing param=model.layers.38.self_attn.kv_b_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 619/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.o_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 619/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.o_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                   | 620/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.q_a_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                   | 620/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.q_a_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 621/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.q_a_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 621/751 [02:17<00:47,  2.76it/s, Materializing param=model.layers.38.self_attn.q_a_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 622/751 [02:17<00:46,  2.76it/s, Materializing param=model.layers.38.self_attn.q_b_proj.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 622/751 [02:17<00:46,  2.76it/s, Materializing param=model.layers.38.self_attn.q_b_proj.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 623/751 [02:17<00:46,  2.76it/s, Materializing param=model.layers.39.input_layernorm.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 623/751 [02:17<00:46,  2.76it/s, Materializing param=model.layers.39.input_layernorm.weight]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 624/751 [02:17<00:45,  2.76it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 624/751 [02:17<00:45,  2.76it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 625/751 [02:19<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 625/751 [02:18<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.down_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████                    | 625/751 [02:19<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████                    | 625/751 [02:18<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████                    | 625/751 [02:19<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████                    | 625/751 [02:18<00:29,  4.30it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████▌                  | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.e_score_correction_bias]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████▌                  | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.e_score_correction_bias]Loading weights:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 627/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.weight]Loading weights:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 627/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 628/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.down_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 628/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.down_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▍                 | 629/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.gate_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▍                 | 629/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.gate_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████▎                 | 630/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.up_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████▎                 | 630/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.up_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████                  | 631/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.post_attention_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████                  | 631/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.post_attention_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████▎                 | 632/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████▎                 | 632/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_layernorm.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████                 | 633/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████                 | 633/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 634/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_b_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 634/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_b_proj.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 635/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.o_proj.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 635/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.o_proj.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████▋                 | 636/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_layernorm.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████▋                 | 636/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_layernorm.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████                  | 637/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████                  | 637/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 638/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_b_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 638/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_b_proj.weight]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 639/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.input_layernorm.weight]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 639/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.input_layernorm.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 640/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 640/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.experts.gate_up_proj]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████▌                  | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.e_score_correction_bias]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████▌                  | 626/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.e_score_correction_bias]Loading weights:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 627/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.weight]Loading weights:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 627/751 [02:22<00:45,  2.74it/s, Materializing param=model.layers.39.mlp.gate.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 628/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.down_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▎                 | 628/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.down_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▍                 | 629/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.gate_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████▍                 | 629/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.gate_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████▎                 | 630/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.up_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████▎                 | 630/751 [02:22<00:44,  2.74it/s, Materializing param=model.layers.39.mlp.shared_experts.up_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████                  | 631/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.post_attention_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████                  | 631/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.post_attention_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████▎                 | 632/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_layernorm.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████▎                 | 632/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_layernorm.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████                 | 633/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████                 | 633/751 [02:22<00:43,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 634/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_b_proj.weight]Loading weights:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 634/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.kv_b_proj.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 635/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.o_proj.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 635/751 [02:22<00:42,  2.74it/s, Materializing param=model.layers.39.self_attn.o_proj.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████▋                 | 636/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_layernorm.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████▋                 | 636/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_layernorm.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████                  | 637/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████                  | 637/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_a_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 638/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_b_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 638/751 [02:22<00:41,  2.74it/s, Materializing param=model.layers.39.self_attn.q_b_proj.weight]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 639/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.input_layernorm.weight]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 639/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.input_layernorm.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 640/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 640/751 [02:22<00:40,  2.74it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.down_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 641/751 [02:23<00:25,  4.27it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 642/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 642/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.gate.e_score_correction_bias]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 642/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.gate.e_score_correction_bias]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 643/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.gate.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 643/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.gate.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▌               | 644/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.down_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▌               | 644/751 [02:26<00:39,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.down_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 645/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.gate_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 645/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.gate_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████▌               | 646/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.up_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████▌               | 646/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.mlp.shared_experts.up_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▍               | 647/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.post_attention_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▍               | 647/751 [02:26<00:38,  2.74it/s, Materializing param=model.layers.40.post_attention_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▋               | 648/751 [02:26<00:37,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_a_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▋               | 648/751 [02:26<00:37,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_a_layernorm.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████▎              | 649/751 [02:26<00:37,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████▎              | 649/751 [02:26<00:37,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 650/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_b_proj.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 650/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.kv_b_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████                | 651/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.o_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████                | 651/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.o_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████               | 652/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.q_a_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████               | 652/751 [02:26<00:36,  2.74it/s, Materializing param=model.layers.40.self_attn.q_a_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 653/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.40.self_attn.q_a_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 653/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.40.self_attn.q_a_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 654/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.40.self_attn.q_b_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 654/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.40.self_attn.q_b_proj.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 655/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.41.input_layernorm.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 655/751 [02:26<00:35,  2.74it/s, Materializing param=model.layers.41.input_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 656/751 [02:26<00:34,  2.74it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 656/751 [02:26<00:34,  2.74it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 642/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.experts.gate_up_proj]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 642/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.gate.e_score_correction_bias]Loading weights:  85%|██████████████████████████████████████████████████████████████████████████████████████████████▉                | 642/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.gate.e_score_correction_bias]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 643/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.gate.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 643/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.gate.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▌               | 644/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.down_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▌               | 644/751 [02:27<00:39,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.down_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 645/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.gate_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████▊               | 645/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.gate_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████▌               | 646/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.up_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████▌               | 646/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.mlp.shared_experts.up_proj.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▍               | 647/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.post_attention_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▍               | 647/751 [02:27<00:38,  2.73it/s, Materializing param=model.layers.40.post_attention_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▋               | 648/751 [02:27<00:37,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_a_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████▋               | 648/751 [02:27<00:37,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_a_layernorm.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████▎              | 649/751 [02:27<00:37,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  86%|█████████████████████████████████████████████████████████████████████████████████████████████▎              | 649/751 [02:27<00:37,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 650/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_b_proj.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 650/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.kv_b_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████                | 651/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.o_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████                | 651/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.o_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████               | 652/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.q_a_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████               | 652/751 [02:27<00:36,  2.73it/s, Materializing param=model.layers.40.self_attn.q_a_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 653/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.40.self_attn.q_a_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 653/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.40.self_attn.q_a_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 654/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.40.self_attn.q_b_proj.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 654/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.40.self_attn.q_b_proj.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 655/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.41.input_layernorm.weight]Loading weights:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 655/751 [02:27<00:35,  2.73it/s, Materializing param=model.layers.41.input_layernorm.weight]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 656/751 [02:27<00:34,  2.73it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 656/751 [02:27<00:34,  2.73it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.down_proj]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████               | 657/751 [02:28<00:22,  4.23it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 658/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████▎             | 658/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.gate.e_score_correction_bias]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████▎             | 658/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.gate.e_score_correction_bias]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 659/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.gate.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 659/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.gate.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 660/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.down_proj.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 660/751 [02:31<00:33,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.down_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████             | 661/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.gate_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████             | 661/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.gate_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████▉             | 662/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.up_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 658/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.experts.gate_up_proj]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████▉             | 662/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.mlp.shared_experts.up_proj.weight]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████▎             | 658/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.gate.e_score_correction_bias]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████▉             | 663/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.post_attention_layernorm.weight]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████▎             | 658/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.gate.e_score_correction_bias]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████▉             | 663/751 [02:31<00:32,  2.75it/s, Materializing param=model.layers.41.post_attention_layernorm.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 659/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.gate.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 664/751 [02:31<00:31,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_a_layernorm.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 659/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.gate.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 664/751 [02:31<00:31,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_a_layernorm.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 660/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.down_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████▋            | 665/751 [02:31<00:31,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████▉             | 660/751 [02:31<00:33,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.down_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████▋            | 665/751 [02:31<00:31,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 666/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_b_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████             | 661/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.gate_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 666/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.kv_b_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████             | 661/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.gate_proj.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 667/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.o_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████▉             | 662/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.up_proj.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 667/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.o_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████▉             | 662/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.mlp.shared_experts.up_proj.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 668/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.q_a_layernorm.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████▉             | 663/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.post_attention_layernorm.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 668/751 [02:31<00:30,  2.75it/s, Materializing param=model.layers.41.self_attn.q_a_layernorm.weight]Loading weights:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████▉             | 663/751 [02:31<00:32,  2.74it/s, Materializing param=model.layers.41.post_attention_layernorm.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████             | 669/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.41.self_attn.q_a_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 664/751 [02:31<00:31,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_a_layernorm.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████             | 669/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.41.self_attn.q_a_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████             | 664/751 [02:31<00:31,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_a_layernorm.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 670/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.41.self_attn.q_b_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████▋            | 665/751 [02:31<00:31,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 670/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.41.self_attn.q_b_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████▋            | 665/751 [02:31<00:31,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 671/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.42.input_layernorm.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 666/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_b_proj.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 671/751 [02:31<00:29,  2.75it/s, Materializing param=model.layers.42.input_layernorm.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 666/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.kv_b_proj.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 672/751 [02:31<00:28,  2.75it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 667/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.o_proj.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 672/751 [02:31<00:28,  2.75it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 667/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.o_proj.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 668/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.q_a_layernorm.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 668/751 [02:31<00:30,  2.74it/s, Materializing param=model.layers.41.self_attn.q_a_layernorm.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████             | 669/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.41.self_attn.q_a_proj.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████             | 669/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.41.self_attn.q_a_proj.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 670/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.41.self_attn.q_b_proj.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 670/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.41.self_attn.q_b_proj.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 671/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.42.input_layernorm.weight]Loading weights:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 671/751 [02:31<00:29,  2.74it/s, Materializing param=model.layers.42.input_layernorm.weight]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 672/751 [02:31<00:28,  2.74it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 672/751 [02:31<00:28,  2.74it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 673/751 [02:33<00:18,  4.29it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 673/751 [02:32<00:18,  4.28it/s, Materializing param=model.layers.42.mlp.experts.down_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 673/751 [02:33<00:18,  4.29it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 673/751 [02:32<00:18,  4.28it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 673/751 [02:33<00:18,  4.29it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 673/751 [02:32<00:18,  4.28it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.gate.e_score_correction_bias]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.gate.e_score_correction_bias]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 675/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.gate.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 675/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.gate.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 676/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.down_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 676/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.down_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▎          | 677/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.gate_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▎          | 677/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.gate_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▎          | 678/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.up_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▎          | 678/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.up_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 679/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.post_attention_layernorm.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 679/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.post_attention_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 680/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 680/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▉          | 681/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▉          | 681/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 682/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_b_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 682/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_b_proj.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 683/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.o_proj.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 683/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.o_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 684/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_layernorm.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 684/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_layernorm.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 685/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 685/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 686/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.42.self_attn.q_b_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 686/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.42.self_attn.q_b_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 687/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.input_layernorm.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 687/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.input_layernorm.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 688/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 688/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.experts.gate_up_proj]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.gate.e_score_correction_bias]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▌           | 674/751 [02:36<00:28,  2.72it/s, Materializing param=model.layers.42.mlp.gate.e_score_correction_bias]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 675/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.gate.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 675/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.gate.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 676/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.down_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 676/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.down_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▎          | 677/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.gate_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▎          | 677/751 [02:36<00:27,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.gate_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▎          | 678/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.up_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████▎          | 678/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.mlp.shared_experts.up_proj.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 679/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.post_attention_layernorm.weight]Loading weights:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 679/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.post_attention_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 680/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 680/751 [02:36<00:26,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▉          | 681/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▉          | 681/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 682/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_b_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 682/751 [02:36<00:25,  2.72it/s, Materializing param=model.layers.42.self_attn.kv_b_proj.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 683/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.o_proj.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 683/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.o_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 684/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_layernorm.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 684/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_layernorm.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 685/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 685/751 [02:36<00:24,  2.72it/s, Materializing param=model.layers.42.self_attn.q_a_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 686/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.42.self_attn.q_b_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 686/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.42.self_attn.q_b_proj.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 687/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.input_layernorm.weight]Loading weights:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 687/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.input_layernorm.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 688/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 688/751 [02:36<00:23,  2.72it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.down_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 689/751 [02:37<00:14,  4.24it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 690/751 [02:41<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 690/751 [02:41<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.e_score_correction_bias]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 690/751 [02:41<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.e_score_correction_bias]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 691/751 [02:41<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 691/751 [02:41<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 692/751 [02:41<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.down_proj.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 692/751 [02:41<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.down_proj.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▋        | 693/751 [02:41<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.gate_proj.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 690/751 [02:40<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.experts.gate_up_proj]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▋        | 693/751 [02:41<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.gate_proj.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 690/751 [02:40<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.e_score_correction_bias]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 694/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.up_proj.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 690/751 [02:40<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.e_score_correction_bias]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 694/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.up_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 695/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.post_attention_layernorm.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 691/751 [02:40<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 695/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.post_attention_layernorm.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 691/751 [02:40<00:22,  2.72it/s, Materializing param=model.layers.43.mlp.gate.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 696/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_layernorm.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 696/751 [02:41<00:20,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_layernorm.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 692/751 [02:40<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.down_proj.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▌        | 692/751 [02:40<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.down_proj.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 697/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 697/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▋        | 693/751 [02:40<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.gate_proj.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 698/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_b_proj.weight]Loading weights:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████▋        | 693/751 [02:40<00:21,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.gate_proj.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 698/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_b_proj.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 694/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.up_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 699/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.o_proj.weight]Loading weights:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 694/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.mlp.shared_experts.up_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 699/751 [02:41<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.o_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 695/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.post_attention_layernorm.weight]Loading weights:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 700/751 [02:41<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_layernorm.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 695/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.post_attention_layernorm.weight]Loading weights:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 700/751 [02:41<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_layernorm.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 696/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_layernorm.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 701/751 [02:41<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 696/751 [02:40<00:20,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_layernorm.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 701/751 [02:41<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_proj.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 702/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.43.self_attn.q_b_proj.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 697/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 702/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.43.self_attn.q_b_proj.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 697/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 703/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.44.input_layernorm.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 698/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_b_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 703/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.44.input_layernorm.weight]Loading weights:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 698/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.kv_b_proj.weight]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 704/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 699/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.o_proj.weight]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 704/751 [02:41<00:17,  2.72it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 699/751 [02:40<00:19,  2.72it/s, Materializing param=model.layers.43.self_attn.o_proj.weight]Loading weights:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 700/751 [02:40<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_layernorm.weight]Loading weights:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 700/751 [02:40<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_layernorm.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 701/751 [02:40<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_proj.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 701/751 [02:40<00:18,  2.72it/s, Materializing param=model.layers.43.self_attn.q_a_proj.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 702/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.43.self_attn.q_b_proj.weight]Loading weights:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 702/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.43.self_attn.q_b_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 703/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.44.input_layernorm.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 703/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.44.input_layernorm.weight]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 704/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 704/751 [02:40<00:17,  2.72it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.down_proj]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 705/751 [02:42<00:10,  4.21it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.e_score_correction_bias]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.e_score_correction_bias]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 707/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.weight]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 707/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 708/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.down_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 708/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.down_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 709/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.gate_proj.weight]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.experts.gate_up_proj]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 709/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.gate_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 710/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.up_proj.weight]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.e_score_correction_bias]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 710/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.up_proj.weight]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 706/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.e_score_correction_bias]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████      | 711/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.post_attention_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████      | 711/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.post_attention_layernorm.weight]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 707/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 712/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_layernorm.weight]Loading weights:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 707/751 [02:45<00:16,  2.69it/s, Materializing param=model.layers.44.mlp.gate.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 712/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_layernorm.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 708/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.down_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 713/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 708/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.down_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 713/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 714/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_b_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 709/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.gate_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 714/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_b_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 709/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.gate_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 715/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.o_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 710/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.up_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 715/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.o_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 710/751 [02:45<00:15,  2.69it/s, Materializing param=model.layers.44.mlp.shared_experts.up_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 716/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████      | 711/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.post_attention_layernorm.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 716/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████      | 711/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.post_attention_layernorm.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 717/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_proj.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 717/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 712/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 712/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_layernorm.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 718/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_b_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 718/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_b_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 713/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 719/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.input_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 713/751 [02:45<00:14,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 719/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.input_layernorm.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 714/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_b_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 720/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 714/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.kv_b_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 720/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 715/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.o_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 715/751 [02:45<00:13,  2.69it/s, Materializing param=model.layers.44.self_attn.o_proj.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 716/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_layernorm.weight]Loading weights:  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 716/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_layernorm.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 717/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_proj.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 717/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_a_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 718/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_b_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 718/751 [02:45<00:12,  2.69it/s, Materializing param=model.layers.44.self_attn.q_b_proj.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 719/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.input_layernorm.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 719/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.input_layernorm.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 720/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 720/751 [02:45<00:11,  2.69it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.down_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 721/751 [02:47<00:07,  4.20it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.e_score_correction_bias]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.e_score_correction_bias]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 723/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 723/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████    | 724/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.down_proj.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████    | 724/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.down_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 725/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.gate_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 725/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.gate_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.experts.gate_up_proj]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 726/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.up_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.e_score_correction_bias]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 726/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.up_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 722/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.e_score_correction_bias]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 727/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.post_attention_layernorm.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 727/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.post_attention_layernorm.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 723/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 728/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_layernorm.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 723/751 [02:50<00:10,  2.70it/s, Materializing param=model.layers.45.mlp.gate.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 728/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_layernorm.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████    | 724/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.down_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 729/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 729/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████    | 724/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.down_proj.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 730/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_b_proj.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 730/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_b_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 725/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.gate_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 731/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.o_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 725/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.gate_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 731/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.o_proj.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 732/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_layernorm.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 726/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.up_proj.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 732/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_layernorm.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 726/751 [02:50<00:09,  2.70it/s, Materializing param=model.layers.45.mlp.shared_experts.up_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 733/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 733/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 727/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.post_attention_layernorm.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 727/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.post_attention_layernorm.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 734/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_b_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 734/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_b_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 728/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_layernorm.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 735/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.input_layernorm.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 735/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.input_layernorm.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 736/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 728/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_layernorm.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 736/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 729/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 729/751 [02:50<00:08,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 730/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_b_proj.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 730/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.kv_b_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 731/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.o_proj.weight]Loading weights:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 731/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.o_proj.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 732/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_layernorm.weight]Loading weights:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 732/751 [02:50<00:07,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_layernorm.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 733/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 733/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_a_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 734/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_b_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 734/751 [02:50<00:06,  2.70it/s, Materializing param=model.layers.45.self_attn.q_b_proj.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 735/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.input_layernorm.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 735/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.input_layernorm.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 736/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 736/751 [02:50<00:05,  2.70it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 737/751 [02:52<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 737/751 [02:51<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.down_proj]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 737/751 [02:52<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 737/751 [02:51<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 737/751 [02:52<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 737/751 [02:51<00:03,  4.17it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.e_score_correction_bias]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.e_score_correction_bias]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 739/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.weight]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 739/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 740/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.down_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 740/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.down_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 741/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.gate_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 741/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.gate_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 742/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.up_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 742/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.up_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 743/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.post_attention_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 743/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.post_attention_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 744/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 744/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_layernorm.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 745/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 745/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 746/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_b_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 746/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_b_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 747/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.o_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 747/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.o_proj.weight]Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 748/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_layernorm.weight]Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 748/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_layernorm.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 749/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 749/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 750/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_b_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 750/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_b_proj.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  2.70it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  2.70it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  4.28it/s, Materializing param=model.norm.weight]Loading weights:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.experts.gate_up_proj]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.e_score_correction_bias]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 738/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.e_score_correction_bias]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 739/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.weight]Loading weights:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 739/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.gate.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 740/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.down_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 740/751 [02:55<00:04,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.down_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 741/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.gate_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 741/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.gate_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 742/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.up_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 742/751 [02:55<00:03,  2.70it/s, Materializing param=model.layers.46.mlp.shared_experts.up_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 743/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.post_attention_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 743/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.post_attention_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 744/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 744/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_layernorm.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 745/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 745/751 [02:55<00:02,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_a_proj_with_mqa.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 746/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_b_proj.weight]Loading weights:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 746/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.kv_b_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 747/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.o_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 747/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.o_proj.weight]Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 748/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_layernorm.weight]Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 748/751 [02:55<00:01,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_layernorm.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 749/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 749/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_a_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 750/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_b_proj.weight]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 750/751 [02:55<00:00,  2.70it/s, Materializing param=model.layers.46.self_attn.q_b_proj.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  2.70it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  2.70it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 751/751 [02:55<00:00,  4.29it/s, Materializing param=model.norm.weight]
[2026-01-26 21:48:43,857] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:346] [PID:58141] Converting modules to torch.bfloat16
[2026-01-26 21:48:43,860] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:58141] Memory usage after model load 0.000GB ()
[2026-01-26 21:48:43,867] [WARNING] [torchao.<module>:39] [PID:58141] Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu130 for torchao version 0.13.0

[2026-01-26 21:48:43,934] [WARNING] [torchao.<module>:39] [PID:58142] Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu130 for torchao version 0.13.0
trainable params: 20,840,448 || all params: 29,964,231,424 || trainable%: 0.0696
[2026-01-26 21:48:44,155] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:58141] after adapters 0.000GB ()
[2026-01-26 21:48:48,350] [WARNING] [accelerate.utils.dataclasses.__post_init__:1962] [PID:58142] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.
[2026-01-26 21:48:48,350] [WARNING] [accelerate.utils.dataclasses.__post_init__:1962] [PID:58141] sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.
[2026-01-26 21:48:48,893] [INFO] [axolotl.train.save_initial_configs:413] [PID:58141] Pre-saving adapter config to ./outputs/qlora-out...
[2026-01-26 21:48:48,894] [INFO] [axolotl.train.save_initial_configs:417] [PID:58141] Pre-saving tokenizer to ./outputs/qlora-out...
[2026-01-26 21:48:49,037] [INFO] [axolotl.train.save_initial_configs:422] [PID:58141] Pre-saving model config to ./outputs/qlora-out...
[2026-01-26 21:48:49,048] [INFO] [axolotl.train.execute_training:212] [PID:58141] Starting trainer...
  0%|                                                                                                                                                                                                               | 0/5680 [00:00<?, ?it/s][2026-01-26 21:49:14,677] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:58298] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-26 21:49:14,695] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:58299] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-26 21:49:15,851] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:58298] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
[2026-01-26 21:49:15,857] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:58299] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][ATokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:30<26:40,  3.48 examples/s]
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:30<27:03,  3.43 examples/s][ATokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:30<11:01,  8.27 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:31<05:56, 15.04 examples/s]
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:31<11:06,  8.20 examples/s][ATokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:31<03:39, 23.98 examples/s]
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:31<06:03, 14.73 examples/s][ATokenizing Prompts (num_proc=54):   9%|██████████████▌                                                                                                                                             | 530/5677 [00:31<02:23, 35.80 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▍                                                                                                                                          | 636/5677 [00:32<01:37, 51.92 examples/s]
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:32<03:45, 23.27 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▌                                                                                                                                             | 530/5677 [00:32<02:23, 35.76 examples/s][ATokenizing Prompts (num_proc=54):  13%|████████████████████▍                                                                                                                                       | 742/5677 [00:32<01:09, 71.27 examples/s]
Tokenizing Prompts (num_proc=54):  11%|█████████████████▍                                                                                                                                          | 636/5677 [00:32<01:38, 51.17 examples/s][ATokenizing Prompts (num_proc=54):  15%|███████████████████████▎                                                                                                                                    | 847/5677 [00:32<00:51, 94.12 examples/s]
Tokenizing Prompts (num_proc=54):  13%|████████████████████▍                                                                                                                                       | 742/5677 [00:32<01:10, 70.42 examples/s][ATokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:33<00:39, 119.59 examples/s]
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▎                                                                                                                                    | 847/5677 [00:33<00:52, 92.11 examples/s][ATokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:33<00:30, 149.89 examples/s]
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:33<00:39, 118.93 examples/s][ATokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:33<00:25, 177.70 examples/s]
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:33<00:31, 147.14 examples/s][ATokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:34<00:21, 204.43 examples/s]
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:34<00:25, 174.83 examples/s][ATokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:34<00:19, 220.43 examples/s]
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:34<00:21, 203.14 examples/s][ATokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:34<00:16, 249.92 examples/s]
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:34<00:19, 225.88 examples/s][ATokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:35<00:15, 264.86 examples/s]
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:35<00:17, 245.29 examples/s][ATokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:35<00:14, 276.53 examples/s]
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:35<00:16, 252.75 examples/s][ATokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:35<00:13, 284.63 examples/s]
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:36<00:15, 261.39 examples/s][ATokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:36<00:12, 291.54 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:36<00:12, 297.54 examples/s]
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:36<00:15, 251.52 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:36<00:12, 307.54 examples/s][ATokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:37<00:13, 269.10 examples/s]
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:37<00:12, 295.17 examples/s][ATokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:37<00:11, 310.05 examples/s]
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:37<00:11, 320.01 examples/s][ATokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:37<00:10, 321.48 examples/s]
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:37<00:10, 318.19 examples/s][ATokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:37<00:10, 315.54 examples/s]
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:38<00:10, 316.41 examples/s][ATokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:38<00:10, 314.60 examples/s]
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:38<00:10, 315.89 examples/s][ATokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:38<00:09, 312.88 examples/s]
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:38<00:09, 315.59 examples/s][ATokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:38<00:09, 311.47 examples/s]
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:39<00:09, 316.36 examples/s][ATokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:39<00:09, 309.66 examples/s]
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:39<00:09, 306.46 examples/s][ATokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:39<00:08, 309.04 examples/s]
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:39<00:09, 313.80 examples/s][ATokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:39<00:08, 307.81 examples/s]
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:40<00:09, 283.02 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:40<00:08, 321.97 examples/s][ATokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:40<00:11, 221.87 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:41<00:07, 317.65 examples/s]
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:41<00:10, 230.12 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:41<00:06, 360.74 examples/s][ATokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:41<00:08, 250.78 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:42<00:07, 266.81 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:42<00:07, 278.59 examples/s]
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:42<00:09, 227.61 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:42<00:07, 278.26 examples/s][ATokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:42<00:06, 275.70 examples/s]
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:42<00:06, 290.40 examples/s][ATokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:43<00:05, 307.58 examples/s]
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:43<00:06, 293.84 examples/s][ATokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:43<00:05, 289.46 examples/s]
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:43<00:05, 303.50 examples/s][ATokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:43<00:05, 312.67 examples/s]
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:43<00:05, 300.20 examples/s][ATokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:44<00:04, 294.50 examples/s]
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:44<00:05, 309.89 examples/s][ATokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:44<00:04, 318.58 examples/s]
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:44<00:04, 310.07 examples/s][ATokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:44<00:04, 300.22 examples/s]
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:44<00:04, 300.71 examples/s][ATokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:45<00:03, 319.44 examples/s]
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:45<00:03, 315.05 examples/s][ATokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:45<00:03, 324.40 examples/s]
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:45<00:03, 319.44 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:45<00:03, 324.65 examples/s][ATokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:46<00:03, 249.27 examples/s]
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:46<00:03, 310.45 examples/s][ATokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:46<00:02, 341.78 examples/s]
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:46<00:02, 325.67 examples/s][ATokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:46<00:01, 333.16 examples/s]
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:46<00:02, 327.70 examples/s][ATokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:47<00:01, 327.44 examples/s]
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:47<00:01, 323.06 examples/s][ATokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:47<00:01, 324.41 examples/s]
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:47<00:01, 320.94 examples/s][ATokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:47<00:01, 314.00 examples/s]
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:47<00:01, 310.46 examples/s][ATokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:47<00:00, 357.33 examples/s]
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:48<00:00, 317.63 examples/s][ATokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:48<00:00, 326.38 examples/s]
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:48<00:00, 293.13 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:48<00:00, 339.99 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:48<00:00, 307.06 examples/s]
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:48<00:00, 378.59 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:51<00:00, 109.74 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:51<00:00, 109.35 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][ADropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 867.66 examples/s]
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:04, 958.04 examples/s][ADropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1298.16 examples/s]
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1317.11 examples/s][ADropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1536.15 examples/s]
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1535.51 examples/s][ADropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1598.56 examples/s]
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1636.38 examples/s][ADropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1672.25 examples/s]
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1729.53 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1729.86 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1546.24 examples/s]
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1762.30 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1585.07 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][AAdd position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1283.30 examples/s]
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1348.33 examples/s][AAdd position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1957.83 examples/s]
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1995.43 examples/s][AAdd position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2371.24 examples/s]
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2351.48 examples/s][AAdd position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2642.56 examples/s]
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2599.05 examples/s][AAdd position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2816.73 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2412.90 examples/s]
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2735.91 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2390.34 examples/s]
[2026-01-26 21:50:14,162] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:58299] Using single process for pack_parallel, running sequentially.

[2026-01-26 21:50:19,234] [WARNING] [py.warnings._showwarnmsg:109] [PID:58298] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

[2026-01-26 21:50:19,351] [WARNING] [py.warnings._showwarnmsg:109] [PID:58299] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

  0%|                                                                                                                                                                                                  | 1/5680 [02:08<202:31:09, 128.38s/it]                                                                                                                                                                                                                                             {'loss': '1.445', 'grad_norm': '0.4259', 'learning_rate': '0.0002', 'ppl': '4.241', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '130.7', 'tokens/total': 8192, 'tokens/trainable': 8166, 'epoch': '0.0001761'}
  0%|                                                                                                                                                                                                  | 1/5680 [02:08<202:31:09, 128.38s/it]  0%|                                                                                                                                                                                                    | 2/5680 [02:16<90:46:45, 57.56s/it]                                                                                                                                                                                                                                             {'loss': '1.333', 'grad_norm': '0.3303', 'learning_rate': '0.0002', 'ppl': '3.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 16384, 'tokens/trainable': 16342, 'epoch': '0.0003521'}
  0%|                                                                                                                                                                                                    | 2/5680 [02:16<90:46:45, 57.56s/it]  0%|                                                                                                                                                                                                    | 3/5680 [02:24<55:05:05, 34.93s/it]                                                                                                                                                                                                                                             {'loss': '1.535', 'grad_norm': '0.3587', 'learning_rate': '0.0002', 'ppl': '4.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 24576, 'tokens/trainable': 24518, 'epoch': '0.0005282'}
  0%|                                                                                                                                                                                                    | 3/5680 [02:24<55:05:05, 34.93s/it]  0%|▏                                                                                                                                                                                                   | 4/5680 [02:32<38:20:06, 24.31s/it]                                                                                                                                                                                                                                             {'loss': '1.4', 'grad_norm': '0.3384', 'learning_rate': '0.0002', 'ppl': '4.057', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 32768, 'tokens/trainable': 32677, 'epoch': '0.0007042'}
  0%|▏                                                                                                                                                                                                   | 4/5680 [02:32<38:20:06, 24.31s/it]  0%|▏                                                                                                                                                                                                   | 5/5680 [02:40<29:05:26, 18.45s/it]                                                                                                                                                                                                                                             {'loss': '1.283', 'grad_norm': '0.3921', 'learning_rate': '0.0002', 'ppl': '3.606', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 40960, 'tokens/trainable': 40811, 'epoch': '0.0008803'}
  0%|▏                                                                                                                                                                                                   | 5/5680 [02:40<29:05:26, 18.45s/it]  0%|▏                                                                                                                                                                                                   | 6/5680 [02:48<23:31:25, 14.93s/it]                                                                                                                                                                                                                                             {'loss': '1.143', 'grad_norm': '0.2735', 'learning_rate': '0.0002', 'ppl': '3.138', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 49152, 'tokens/trainable': 48979, 'epoch': '0.001056'}
  0%|▏                                                                                                                                                                                                   | 6/5680 [02:48<23:31:25, 14.93s/it]  0%|▏                                                                                                                                                                                                   | 7/5680 [02:56<19:57:04, 12.66s/it]                                                                                                                                                                                                                                             {'loss': '1.223', 'grad_norm': '0.312', 'learning_rate': '0.0002', 'ppl': '3.398', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 57344, 'tokens/trainable': 57170, 'epoch': '0.001232'}
  0%|▏                                                                                                                                                                                                   | 7/5680 [02:56<19:57:04, 12.66s/it]  0%|▎                                                                                                                                                                                                   | 8/5680 [03:04<17:39:07, 11.20s/it]                                                                                                                                                                                                                                             {'loss': '1.136', 'grad_norm': '0.2928', 'learning_rate': '0.0002', 'ppl': '3.115', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 65536, 'tokens/trainable': 65358, 'epoch': '0.001408'}
  0%|▎                                                                                                                                                                                                   | 8/5680 [03:04<17:39:07, 11.20s/it]  0%|▎                                                                                                                                                                                                   | 9/5680 [03:12<16:04:46, 10.21s/it]                                                                                                                                                                                                                                             {'loss': '0.946', 'grad_norm': '0.299', 'learning_rate': '0.0002', 'ppl': '2.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 73728, 'tokens/trainable': 73529, 'epoch': '0.001585'}
  0%|▎                                                                                                                                                                                                   | 9/5680 [03:12<16:04:46, 10.21s/it]  0%|▎                                                                                                                                                                                                  | 10/5680 [03:20<15:00:56,  9.53s/it]                                                                                                                                                                                                                                             {'loss': '1.226', 'grad_norm': '0.2898', 'learning_rate': '0.0002', 'ppl': '3.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 81920, 'tokens/trainable': 81694, 'epoch': '0.001761'}
  0%|▎                                                                                                                                                                                                  | 10/5680 [03:20<15:00:56,  9.53s/it]  0%|▍                                                                                                                                                                                                  | 11/5680 [03:28<14:16:26,  9.06s/it]                                                                                                                                                                                                                                             {'loss': '0.973', 'grad_norm': '0.2426', 'learning_rate': '0.0002', 'ppl': '2.646', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 90112, 'tokens/trainable': 89878, 'epoch': '0.001937'}
  0%|▍                                                                                                                                                                                                  | 11/5680 [03:28<14:16:26,  9.06s/it]  0%|▍                                                                                                                                                                                                  | 12/5680 [03:36<13:45:13,  8.74s/it]                                                                                                                                                                                                                                             {'loss': '0.9319', 'grad_norm': '0.2668', 'learning_rate': '0.0002', 'ppl': '2.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 98304, 'tokens/trainable': 98054, 'epoch': '0.002113'}
  0%|▍                                                                                                                                                                                                  | 12/5680 [03:36<13:45:13,  8.74s/it]  0%|▍                                                                                                                                                                                                  | 13/5680 [03:44<13:24:17,  8.52s/it]                                                                                                                                                                                                                                             {'loss': '0.8048', 'grad_norm': '0.259', 'learning_rate': '0.0002', 'ppl': '2.236', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 106496, 'tokens/trainable': 106229, 'epoch': '0.002289'}
  0%|▍                                                                                                                                                                                                  | 13/5680 [03:44<13:24:17,  8.52s/it]  0%|▍                                                                                                                                                                                                  | 14/5680 [03:52<13:09:58,  8.37s/it]                                                                                                                                                                                                                                             {'loss': '1.043', 'grad_norm': '0.2473', 'learning_rate': '0.0002', 'ppl': '2.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 114688, 'tokens/trainable': 114378, 'epoch': '0.002465'}
  0%|▍                                                                                                                                                                                                  | 14/5680 [03:52<13:09:58,  8.37s/it]  0%|▌                                                                                                                                                                                                  | 15/5680 [04:00<12:59:55,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '1.341', 'grad_norm': '0.3424', 'learning_rate': '0.0002', 'ppl': '3.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 122880, 'tokens/trainable': 122562, 'epoch': '0.002641'}
  0%|▌                                                                                                                                                                                                  | 15/5680 [04:00<12:59:55,  8.26s/it]  0%|▌                                                                                                                                                                                                  | 16/5680 [04:08<12:54:03,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.841', 'grad_norm': '0.247', 'learning_rate': '0.0002', 'ppl': '2.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 131072, 'tokens/trainable': 130705, 'epoch': '0.002817'}
  0%|▌                                                                                                                                                                                                  | 16/5680 [04:08<12:54:03,  8.20s/it]  0%|▌                                                                                                                                                                                                  | 17/5680 [04:16<12:50:51,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '1.178', 'grad_norm': '0.2913', 'learning_rate': '0.0002', 'ppl': '3.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 139264, 'tokens/trainable': 138856, 'epoch': '0.002993'}
  0%|▌                                                                                                                                                                                                  | 17/5680 [04:16<12:50:51,  8.17s/it]  0%|▌                                                                                                                                                                                                  | 18/5680 [04:24<12:48:27,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7724', 'grad_norm': '0.2356', 'learning_rate': '0.0002', 'ppl': '2.165', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 147456, 'tokens/trainable': 146982, 'epoch': '0.003169'}
  0%|▌                                                                                                                                                                                                  | 18/5680 [04:24<12:48:27,  8.14s/it]  0%|▋                                                                                                                                                                                                  | 19/5680 [04:32<12:42:06,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.9225', 'grad_norm': '0.2434', 'learning_rate': '0.0002', 'ppl': '2.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 155648, 'tokens/trainable': 155142, 'epoch': '0.003345'}
  0%|▋                                                                                                                                                                                                  | 19/5680 [04:32<12:42:06,  8.08s/it]  0%|▋                                                                                                                                                                                                  | 20/5680 [04:40<12:37:54,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.6553', 'grad_norm': '0.2174', 'learning_rate': '0.0002', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 163840, 'tokens/trainable': 163260, 'epoch': '0.003521'}
  0%|▋                                                                                                                                                                                                  | 20/5680 [04:40<12:37:54,  8.03s/it]  0%|▋                                                                                                                                                                                                  | 21/5680 [04:48<12:34:07,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.9019', 'grad_norm': '0.2537', 'learning_rate': '0.0002', 'ppl': '2.464', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 172032, 'tokens/trainable': 171366, 'epoch': '0.003697'}
  0%|▋                                                                                                                                                                                                  | 21/5680 [04:48<12:34:07,  8.00s/it]  0%|▊                                                                                                                                                                                                  | 22/5680 [04:56<12:31:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.9838', 'grad_norm': '0.2551', 'learning_rate': '0.0002', 'ppl': '2.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 180224, 'tokens/trainable': 179526, 'epoch': '0.003873'}
  0%|▊                                                                                                                                                                                                  | 22/5680 [04:56<12:31:15,  7.97s/it]  0%|▊                                                                                                                                                                                                  | 23/5680 [05:04<12:29:06,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7446', 'grad_norm': '0.2132', 'learning_rate': '0.0002', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 188416, 'tokens/trainable': 187664, 'epoch': '0.004049'}
  0%|▊                                                                                                                                                                                                  | 23/5680 [05:04<12:29:06,  7.95s/it]  0%|▊                                                                                                                                                                                                  | 24/5680 [05:12<12:27:52,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.9457', 'grad_norm': '0.2212', 'learning_rate': '0.0002', 'ppl': '2.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 196608, 'tokens/trainable': 195818, 'epoch': '0.004225'}
  0%|▊                                                                                                                                                                                                  | 24/5680 [05:12<12:27:52,  7.93s/it]  0%|▊                                                                                                                                                                                                  | 25/5680 [05:20<12:24:07,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8895', 'grad_norm': '0.2127', 'learning_rate': '0.0002', 'ppl': '2.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 204800, 'tokens/trainable': 203973, 'epoch': '0.004401'}
  0%|▊                                                                                                                                                                                                  | 25/5680 [05:20<12:24:07,  7.90s/it]  0%|▉                                                                                                                                                                                                  | 26/5680 [05:28<12:23:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7922', 'grad_norm': '0.1865', 'learning_rate': '0.0002', 'ppl': '2.208', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 212992, 'tokens/trainable': 212145, 'epoch': '0.004577'}
  0%|▉                                                                                                                                                                                                  | 26/5680 [05:28<12:23:38,  7.89s/it]  0%|▉                                                                                                                                                                                                  | 27/5680 [05:35<12:23:37,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.435', 'grad_norm': '0.2647', 'learning_rate': '0.0002', 'ppl': '4.201', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 221184, 'tokens/trainable': 220295, 'epoch': '0.004754'}
  0%|▉                                                                                                                                                                                                  | 27/5680 [05:35<12:23:37,  7.89s/it]  0%|▉                                                                                                                                                                                                  | 28/5680 [05:43<12:24:19,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6418', 'grad_norm': '0.1785', 'learning_rate': '0.0002', 'ppl': '1.9', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 229376, 'tokens/trainable': 228475, 'epoch': '0.00493'}
  0%|▉                                                                                                                                                                                                  | 28/5680 [05:43<12:24:19,  7.90s/it]  1%|▉                                                                                                                                                                                                  | 29/5680 [05:51<12:22:58,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.387', 'grad_norm': '0.2811', 'learning_rate': '0.0002', 'ppl': '4.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 237568, 'tokens/trainable': 236617, 'epoch': '0.005106'}
  1%|▉                                                                                                                                                                                                  | 29/5680 [05:51<12:22:58,  7.89s/it]  1%|█                                                                                                                                                                                                  | 30/5680 [05:59<12:21:54,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '1.221', 'grad_norm': '0.2545', 'learning_rate': '0.0002', 'ppl': '3.391', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 245760, 'tokens/trainable': 244799, 'epoch': '0.005282'}
  1%|█                                                                                                                                                                                                  | 30/5680 [05:59<12:21:54,  7.88s/it]  1%|█                                                                                                                                                                                                  | 31/5680 [06:07<12:21:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9207', 'grad_norm': '0.2043', 'learning_rate': '0.0002', 'ppl': '2.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 253952, 'tokens/trainable': 252952, 'epoch': '0.005458'}
  1%|█                                                                                                                                                                                                  | 31/5680 [06:07<12:21:14,  7.87s/it]  1%|█                                                                                                                                                                                                  | 32/5680 [06:16<12:40:04,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.8875', 'grad_norm': '0.2421', 'learning_rate': '0.0002', 'ppl': '2.429', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949', 'tokens/total': 262144, 'tokens/trainable': 261059, 'epoch': '0.005634'}
  1%|█                                                                                                                                                                                                  | 32/5680 [06:16<12:40:04,  8.07s/it]  1%|█▏                                                                                                                                                                                                 | 33/5680 [06:24<12:37:45,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.7778', 'grad_norm': '0.1844', 'learning_rate': '0.0002', 'ppl': '2.177', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 270336, 'tokens/trainable': 269173, 'epoch': '0.00581'}
  1%|█▏                                                                                                                                                                                                 | 33/5680 [06:24<12:37:45,  8.05s/it]  1%|█▏                                                                                                                                                                                                 | 34/5680 [06:31<12:33:16,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.618', 'grad_norm': '0.1928', 'learning_rate': '0.0002', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 278528, 'tokens/trainable': 277356, 'epoch': '0.005986'}
  1%|█▏                                                                                                                                                                                                 | 34/5680 [06:31<12:33:16,  8.01s/it]  1%|█▏                                                                                                                                                                                                 | 35/5680 [06:39<12:30:23,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5827', 'grad_norm': '0.1832', 'learning_rate': '0.0002', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 286720, 'tokens/trainable': 285509, 'epoch': '0.006162'}
  1%|█▏                                                                                                                                                                                                 | 35/5680 [06:39<12:30:23,  7.98s/it]  1%|█▏                                                                                                                                                                                                 | 36/5680 [06:47<12:27:09,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.136', 'grad_norm': '0.2343', 'learning_rate': '0.0002', 'ppl': '3.115', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 294912, 'tokens/trainable': 293638, 'epoch': '0.006338'}
  1%|█▏                                                                                                                                                                                                 | 36/5680 [06:47<12:27:09,  7.94s/it]  1%|█▎                                                                                                                                                                                                 | 37/5680 [06:55<12:24:19,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '1.236', 'grad_norm': '0.2477', 'learning_rate': '0.0002', 'ppl': '3.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 303104, 'tokens/trainable': 301757, 'epoch': '0.006514'}
  1%|█▎                                                                                                                                                                                                 | 37/5680 [06:55<12:24:19,  7.91s/it]  1%|█▎                                                                                                                                                                                                 | 38/5680 [07:03<12:23:28,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9226', 'grad_norm': '0.2185', 'learning_rate': '0.0002', 'ppl': '2.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 311296, 'tokens/trainable': 309905, 'epoch': '0.00669'}
  1%|█▎                                                                                                                                                                                                 | 38/5680 [07:03<12:23:28,  7.91s/it]  1%|█▎                                                                                                                                                                                                 | 39/5680 [07:11<12:22:17,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5298', 'grad_norm': '0.1537', 'learning_rate': '0.0002', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 319488, 'tokens/trainable': 318059, 'epoch': '0.006866'}
  1%|█▎                                                                                                                                                                                                 | 39/5680 [07:11<12:22:17,  7.90s/it]  1%|█▎                                                                                                                                                                                                 | 40/5680 [07:19<12:31:36,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.8167', 'grad_norm': '0.2048', 'learning_rate': '0.0002', 'ppl': '2.263', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.4', 'tokens/total': 327680, 'tokens/trainable': 326208, 'epoch': '0.007042'}
  1%|█▎                                                                                                                                                                                                 | 40/5680 [07:19<12:31:36,  8.00s/it]  1%|█▍                                                                                                                                                                                                 | 41/5680 [07:27<12:28:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7873', 'grad_norm': '0.1866', 'learning_rate': '0.0002', 'ppl': '2.198', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 335872, 'tokens/trainable': 334367, 'epoch': '0.007218'}
  1%|█▍                                                                                                                                                                                                 | 41/5680 [07:27<12:28:19,  7.96s/it]  1%|█▍                                                                                                                                                                                                 | 42/5680 [07:35<12:26:03,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.168', 'grad_norm': '0.2436', 'learning_rate': '0.0002', 'ppl': '3.216', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 344064, 'tokens/trainable': 342526, 'epoch': '0.007394'}
  1%|█▍                                                                                                                                                                                                 | 42/5680 [07:35<12:26:03,  7.94s/it]  1%|█▍                                                                                                                                                                                                 | 43/5680 [07:43<12:26:06,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.076', 'grad_norm': '0.2272', 'learning_rate': '0.0002', 'ppl': '2.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 352256, 'tokens/trainable': 350683, 'epoch': '0.00757'}
  1%|█▍                                                                                                                                                                                                 | 43/5680 [07:43<12:26:06,  7.94s/it]  1%|█▌                                                                                                                                                                                                 | 44/5680 [07:51<12:26:38,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8376', 'grad_norm': '0.1953', 'learning_rate': '0.0002', 'ppl': '2.311', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 360448, 'tokens/trainable': 358836, 'epoch': '0.007746'}
  1%|█▌                                                                                                                                                                                                 | 44/5680 [07:51<12:26:38,  7.95s/it]  1%|█▌                                                                                                                                                                                                 | 45/5680 [07:59<12:26:29,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '1.14', 'grad_norm': '0.2562', 'learning_rate': '0.0002', 'ppl': '3.126', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 368640, 'tokens/trainable': 366965, 'epoch': '0.007923'}
  1%|█▌                                                                                                                                                                                                 | 45/5680 [07:59<12:26:29,  7.95s/it]  1%|█▌                                                                                                                                                                                                 | 46/5680 [08:07<12:25:23,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.9307', 'grad_norm': '0.2256', 'learning_rate': '0.0002', 'ppl': '2.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 376832, 'tokens/trainable': 375106, 'epoch': '0.008099'}
  1%|█▌                                                                                                                                                                                                 | 46/5680 [08:07<12:25:23,  7.94s/it]  1%|█▌                                                                                                                                                                                                 | 47/5680 [08:14<12:24:08,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6325', 'grad_norm': '0.2302', 'learning_rate': '0.0002', 'ppl': '1.882', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 385024, 'tokens/trainable': 383272, 'epoch': '0.008275'}
  1%|█▌                                                                                                                                                                                                 | 47/5680 [08:14<12:24:08,  7.93s/it]  1%|█▋                                                                                                                                                                                                 | 48/5680 [08:22<12:22:16,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7047', 'grad_norm': '0.184', 'learning_rate': '0.0002', 'ppl': '2.023', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 393216, 'tokens/trainable': 391455, 'epoch': '0.008451'}
  1%|█▋                                                                                                                                                                                                 | 48/5680 [08:22<12:22:16,  7.91s/it]  1%|█▋                                                                                                                                                                                                 | 49/5680 [08:31<12:35:33,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '1.102', 'grad_norm': '0.2551', 'learning_rate': '0.0002', 'ppl': '3.009', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.4', 'tokens/total': 401408, 'tokens/trainable': 399623, 'epoch': '0.008627'}
  1%|█▋                                                                                                                                                                                                 | 49/5680 [08:31<12:35:33,  8.05s/it]  1%|█▋                                                                                                                                                                                                 | 50/5680 [08:39<12:29:51,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.9401', 'grad_norm': '0.2238', 'learning_rate': '0.0002', 'ppl': '2.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 409600, 'tokens/trainable': 407815, 'epoch': '0.008803'}
  1%|█▋                                                                                                                                                                                                 | 50/5680 [08:39<12:29:51,  7.99s/it]  1%|█▊                                                                                                                                                                                                 | 51/5680 [08:46<12:27:08,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.602', 'grad_norm': '0.1838', 'learning_rate': '0.0002', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 417792, 'tokens/trainable': 415945, 'epoch': '0.008979'}
  1%|█▊                                                                                                                                                                                                 | 51/5680 [08:46<12:27:08,  7.96s/it]  1%|█▊                                                                                                                                                                                                 | 52/5680 [08:54<12:25:04,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7277', 'grad_norm': '0.2027', 'learning_rate': '0.0002', 'ppl': '2.07', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 425984, 'tokens/trainable': 424097, 'epoch': '0.009155'}
  1%|█▊                                                                                                                                                                                                 | 52/5680 [08:54<12:25:04,  7.94s/it]  1%|█▊                                                                                                                                                                                                 | 53/5680 [09:02<12:23:01,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '1.055', 'grad_norm': '0.2672', 'learning_rate': '0.0002', 'ppl': '2.871', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 434176, 'tokens/trainable': 432255, 'epoch': '0.009331'}
  1%|█▊                                                                                                                                                                                                 | 53/5680 [09:02<12:23:01,  7.92s/it]  1%|█▊                                                                                                                                                                                                 | 54/5680 [09:10<12:22:18,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.762', 'grad_norm': '0.2009', 'learning_rate': '0.0002', 'ppl': '2.142', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 442368, 'tokens/trainable': 440395, 'epoch': '0.009507'}
  1%|█▊                                                                                                                                                                                                 | 54/5680 [09:10<12:22:18,  7.92s/it]  1%|█▉                                                                                                                                                                                                 | 55/5680 [09:18<12:21:33,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5444', 'grad_norm': '0.1704', 'learning_rate': '0.0002', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 450560, 'tokens/trainable': 448543, 'epoch': '0.009683'}
  1%|█▉                                                                                                                                                                                                 | 55/5680 [09:18<12:21:33,  7.91s/it]  1%|█▉                                                                                                                                                                                                 | 56/5680 [09:26<12:19:26,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.243', 'grad_norm': '0.2781', 'learning_rate': '0.0002', 'ppl': '3.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 458752, 'tokens/trainable': 456667, 'epoch': '0.009859'}
  1%|█▉                                                                                                                                                                                                 | 56/5680 [09:26<12:19:26,  7.89s/it]  1%|█▉                                                                                                                                                                                                 | 57/5680 [09:34<12:19:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.061', 'grad_norm': '0.2538', 'learning_rate': '0.0002', 'ppl': '2.888', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 466944, 'tokens/trainable': 464809, 'epoch': '0.01004'}
  1%|█▉                                                                                                                                                                                                 | 57/5680 [09:34<12:19:19,  7.89s/it]  1%|█▉                                                                                                                                                                                                 | 58/5680 [09:42<12:18:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8273', 'grad_norm': '0.2284', 'learning_rate': '0.0002', 'ppl': '2.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 475136, 'tokens/trainable': 472978, 'epoch': '0.01021'}
  1%|█▉                                                                                                                                                                                                 | 58/5680 [09:42<12:18:25,  7.88s/it]  1%|██                                                                                                                                                                                                 | 59/5680 [09:50<12:19:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5447', 'grad_norm': '0.1998', 'learning_rate': '0.0001999', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 483328, 'tokens/trainable': 481146, 'epoch': '0.01039'}
  1%|██                                                                                                                                                                                                 | 59/5680 [09:50<12:19:35,  7.89s/it]  1%|██                                                                                                                                                                                                 | 60/5680 [09:58<12:26:51,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6048', 'grad_norm': '0.1858', 'learning_rate': '0.0001999', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 491520, 'tokens/trainable': 489331, 'epoch': '0.01056'}
  1%|██                                                                                                                                                                                                 | 60/5680 [09:58<12:26:51,  7.97s/it]  1%|██                                                                                                                                                                                                 | 61/5680 [10:06<12:30:40,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5936', 'grad_norm': '0.1875', 'learning_rate': '0.0001999', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 499712, 'tokens/trainable': 497505, 'epoch': '0.01074'}
  1%|██                                                                                                                                                                                                 | 61/5680 [10:06<12:30:40,  8.02s/it]  1%|██▏                                                                                                                                                                                                | 62/5680 [10:14<12:34:23,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '1.091', 'grad_norm': '0.247', 'learning_rate': '0.0001999', 'ppl': '2.976', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.9', 'tokens/total': 507904, 'tokens/trainable': 505654, 'epoch': '0.01092'}
  1%|██▏                                                                                                                                                                                                | 62/5680 [10:14<12:34:23,  8.06s/it]  1%|██▏                                                                                                                                                                                                | 63/5680 [10:22<12:36:16,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.4963', 'grad_norm': '0.1802', 'learning_rate': '0.0001999', 'ppl': '1.643', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 516096, 'tokens/trainable': 513774, 'epoch': '0.01109'}
  1%|██▏                                                                                                                                                                                                | 63/5680 [10:22<12:36:16,  8.08s/it]  1%|██▏                                                                                                                                                                                                | 64/5680 [10:30<12:38:10,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '1.052', 'grad_norm': '0.2285', 'learning_rate': '0.0001999', 'ppl': '2.864', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.4', 'tokens/total': 524288, 'tokens/trainable': 521909, 'epoch': '0.01127'}
  1%|██▏                                                                                                                                                                                                | 64/5680 [10:30<12:38:10,  8.10s/it]  1%|██▏                                                                                                                                                                                                | 65/5680 [10:38<12:39:53,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8017', 'grad_norm': '0.2484', 'learning_rate': '0.0001999', 'ppl': '2.229', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.7', 'tokens/total': 532480, 'tokens/trainable': 530029, 'epoch': '0.01144'}
  1%|██▏                                                                                                                                                                                                | 65/5680 [10:38<12:39:53,  8.12s/it]  1%|██▎                                                                                                                                                                                                | 66/5680 [10:47<12:39:00,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.791', 'grad_norm': '0.1989', 'learning_rate': '0.0001999', 'ppl': '2.206', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 540672, 'tokens/trainable': 538145, 'epoch': '0.01162'}
  1%|██▎                                                                                                                                                                                                | 66/5680 [10:47<12:39:00,  8.11s/it]  1%|██▎                                                                                                                                                                                                | 67/5680 [10:55<12:39:59,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.9857', 'grad_norm': '0.2484', 'learning_rate': '0.0001999', 'ppl': '2.68', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 548864, 'tokens/trainable': 546330, 'epoch': '0.0118'}
  1%|██▎                                                                                                                                                                                                | 67/5680 [10:55<12:39:59,  8.12s/it]  1%|██▎                                                                                                                                                                                                | 68/5680 [11:03<12:37:44,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.8307', 'grad_norm': '0.2371', 'learning_rate': '0.0001999', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 557056, 'tokens/trainable': 554466, 'epoch': '0.01197'}
  1%|██▎                                                                                                                                                                                                | 68/5680 [11:03<12:37:44,  8.10s/it]  1%|██▎                                                                                                                                                                                                | 69/5680 [11:11<12:36:31,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5907', 'grad_norm': '0.1941', 'learning_rate': '0.0001999', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 565248, 'tokens/trainable': 562621, 'epoch': '0.01215'}
  1%|██▎                                                                                                                                                                                                | 69/5680 [11:11<12:36:31,  8.09s/it]  1%|██▍                                                                                                                                                                                                | 70/5680 [11:19<12:35:57,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '1.041', 'grad_norm': '0.2611', 'learning_rate': '0.0001999', 'ppl': '2.833', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 573440, 'tokens/trainable': 570796, 'epoch': '0.01232'}
  1%|██▍                                                                                                                                                                                                | 70/5680 [11:19<12:35:57,  8.09s/it]  1%|██▍                                                                                                                                                                                                | 71/5680 [11:27<12:36:30,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5737', 'grad_norm': '0.1904', 'learning_rate': '0.0001999', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 581632, 'tokens/trainable': 578955, 'epoch': '0.0125'}
  1%|██▍                                                                                                                                                                                                | 71/5680 [11:27<12:36:30,  8.09s/it]  1%|██▍                                                                                                                                                                                                | 72/5680 [11:35<12:38:59,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '1.041', 'grad_norm': '0.2313', 'learning_rate': '0.0001999', 'ppl': '2.832', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.4', 'tokens/total': 589824, 'tokens/trainable': 587099, 'epoch': '0.01268'}
  1%|██▍                                                                                                                                                                                                | 72/5680 [11:35<12:38:59,  8.12s/it]  1%|██▌                                                                                                                                                                                                | 73/5680 [11:43<12:40:21,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.6836', 'grad_norm': '0.1903', 'learning_rate': '0.0001999', 'ppl': '1.981', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 598016, 'tokens/trainable': 595272, 'epoch': '0.01285'}
  1%|██▌                                                                                                                                                                                                | 73/5680 [11:43<12:40:21,  8.14s/it]  1%|██▌                                                                                                                                                                                                | 74/5680 [11:51<12:36:23,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.91', 'grad_norm': '0.241', 'learning_rate': '0.0001999', 'ppl': '2.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 606208, 'tokens/trainable': 603453, 'epoch': '0.01303'}
  1%|██▌                                                                                                                                                                                                | 74/5680 [11:51<12:36:23,  8.10s/it]  1%|██▌                                                                                                                                                                                                | 75/5680 [11:59<12:32:48,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9051', 'grad_norm': '0.2311', 'learning_rate': '0.0001999', 'ppl': '2.472', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 614400, 'tokens/trainable': 611564, 'epoch': '0.0132'}
  1%|██▌                                                                                                                                                                                                | 75/5680 [11:59<12:32:48,  8.06s/it]  1%|██▌                                                                                                                                                                                                | 76/5680 [12:07<12:32:08,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.9113', 'grad_norm': '0.249', 'learning_rate': '0.0001999', 'ppl': '2.487', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 622592, 'tokens/trainable': 619747, 'epoch': '0.01338'}
  1%|██▌                                                                                                                                                                                                | 76/5680 [12:07<12:32:08,  8.05s/it]  1%|██▋                                                                                                                                                                                                | 77/5680 [12:15<12:29:55,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.9311', 'grad_norm': '0.2402', 'learning_rate': '0.0001999', 'ppl': '2.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 630784, 'tokens/trainable': 627912, 'epoch': '0.01356'}
  1%|██▋                                                                                                                                                                                                | 77/5680 [12:15<12:29:55,  8.03s/it]  1%|██▋                                                                                                                                                                                                | 78/5680 [12:23<12:32:49,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.8161', 'grad_norm': '0.2197', 'learning_rate': '0.0001999', 'ppl': '2.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 638976, 'tokens/trainable': 636092, 'epoch': '0.01373'}
  1%|██▋                                                                                                                                                                                                | 78/5680 [12:23<12:32:49,  8.06s/it]  1%|██▋                                                                                                                                                                                                | 79/5680 [12:32<12:34:40,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8541', 'grad_norm': '0.2321', 'learning_rate': '0.0001999', 'ppl': '2.349', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 647168, 'tokens/trainable': 644274, 'epoch': '0.01391'}
  1%|██▋                                                                                                                                                                                                | 79/5680 [12:32<12:34:40,  8.08s/it]  1%|██▋                                                                                                                                                                                                | 80/5680 [12:40<12:38:33,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.9201', 'grad_norm': '0.218', 'learning_rate': '0.0001999', 'ppl': '2.509', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.3', 'tokens/total': 655360, 'tokens/trainable': 652420, 'epoch': '0.01408'}
  1%|██▋                                                                                                                                                                                                | 80/5680 [12:40<12:38:33,  8.13s/it]  1%|██▊                                                                                                                                                                                                | 81/5680 [12:48<12:37:13,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '1.162', 'grad_norm': '0.2275', 'learning_rate': '0.0001999', 'ppl': '3.195', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.9', 'tokens/total': 663552, 'tokens/trainable': 660501, 'epoch': '0.01426'}
  1%|██▊                                                                                                                                                                                                | 81/5680 [12:48<12:37:13,  8.11s/it]  1%|██▊                                                                                                                                                                                                | 82/5680 [12:56<12:34:03,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '1.008', 'grad_norm': '0.2176', 'learning_rate': '0.0001999', 'ppl': '2.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 671744, 'tokens/trainable': 668637, 'epoch': '0.01444'}
  1%|██▊                                                                                                                                                                                                | 82/5680 [12:56<12:34:03,  8.08s/it]  1%|██▊                                                                                                                                                                                                | 83/5680 [13:04<12:31:01,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.7498', 'grad_norm': '0.2062', 'learning_rate': '0.0001999', 'ppl': '2.117', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 679936, 'tokens/trainable': 676801, 'epoch': '0.01461'}
  1%|██▊                                                                                                                                                                                                | 83/5680 [13:04<12:31:01,  8.05s/it]  1%|██▉                                                                                                                                                                                                | 84/5680 [13:12<12:29:26,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.8545', 'grad_norm': '0.2208', 'learning_rate': '0.0001999', 'ppl': '2.35', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 688128, 'tokens/trainable': 684915, 'epoch': '0.01479'}
  1%|██▉                                                                                                                                                                                                | 84/5680 [13:12<12:29:26,  8.04s/it]  1%|██▉                                                                                                                                                                                                | 85/5680 [13:20<12:27:12,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.9508', 'grad_norm': '0.2271', 'learning_rate': '0.0001999', 'ppl': '2.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 696320, 'tokens/trainable': 693066, 'epoch': '0.01496'}
  1%|██▉                                                                                                                                                                                                | 85/5680 [13:20<12:27:12,  8.01s/it]  2%|██▉                                                                                                                                                                                                | 86/5680 [13:28<12:26:28,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7044', 'grad_norm': '0.213', 'learning_rate': '0.0001999', 'ppl': '2.023', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 704512, 'tokens/trainable': 701205, 'epoch': '0.01514'}
  2%|██▉                                                                                                                                                                                                | 86/5680 [13:28<12:26:28,  8.01s/it]  2%|██▉                                                                                                                                                                                                | 87/5680 [13:36<12:24:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.9324', 'grad_norm': '0.2319', 'learning_rate': '0.0001999', 'ppl': '2.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 712704, 'tokens/trainable': 709351, 'epoch': '0.01532'}
  2%|██▉                                                                                                                                                                                                | 87/5680 [13:36<12:24:09,  7.98s/it]  2%|███                                                                                                                                                                                                | 88/5680 [13:44<12:23:46,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6564', 'grad_norm': '0.2145', 'learning_rate': '0.0001999', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 720896, 'tokens/trainable': 717516, 'epoch': '0.01549'}
  2%|███                                                                                                                                                                                                | 88/5680 [13:44<12:23:46,  7.98s/it]  2%|███                                                                                                                                                                                                | 89/5680 [13:52<12:23:55,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8241', 'grad_norm': '0.2303', 'learning_rate': '0.0001999', 'ppl': '2.28', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 729088, 'tokens/trainable': 725687, 'epoch': '0.01567'}
  2%|███                                                                                                                                                                                                | 89/5680 [13:52<12:23:55,  7.98s/it]  2%|███                                                                                                                                                                                                | 90/5680 [14:00<12:23:14,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '1.016', 'grad_norm': '0.2463', 'learning_rate': '0.0001999', 'ppl': '2.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 737280, 'tokens/trainable': 733829, 'epoch': '0.01585'}
  2%|███                                                                                                                                                                                                | 90/5680 [14:00<12:23:14,  7.98s/it]  2%|███                                                                                                                                                                                                | 91/5680 [14:08<12:32:02,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.617', 'grad_norm': '0.1998', 'learning_rate': '0.0001999', 'ppl': '1.853', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.3', 'tokens/total': 745472, 'tokens/trainable': 741994, 'epoch': '0.01602'}
  2%|███                                                                                                                                                                                                | 91/5680 [14:08<12:32:02,  8.07s/it]  2%|███▏                                                                                                                                                                                               | 92/5680 [14:16<12:30:21,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.6619', 'grad_norm': '0.1866', 'learning_rate': '0.0001999', 'ppl': '1.938', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 753664, 'tokens/trainable': 750166, 'epoch': '0.0162'}
  2%|███▏                                                                                                                                                                                               | 92/5680 [14:16<12:30:21,  8.06s/it]  2%|███▏                                                                                                                                                                                               | 93/5680 [14:24<12:32:30,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.7941', 'grad_norm': '0.2198', 'learning_rate': '0.0001999', 'ppl': '2.212', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 761856, 'tokens/trainable': 758322, 'epoch': '0.01637'}
  2%|███▏                                                                                                                                                                                               | 93/5680 [14:24<12:32:30,  8.08s/it]  2%|███▏                                                                                                                                                                                               | 94/5680 [14:32<12:34:22,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.772', 'grad_norm': '0.2185', 'learning_rate': '0.0001999', 'ppl': '2.164', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 770048, 'tokens/trainable': 766479, 'epoch': '0.01655'}
  2%|███▏                                                                                                                                                                                               | 94/5680 [14:32<12:34:22,  8.10s/it]  2%|███▎                                                                                                                                                                                               | 95/5680 [14:40<12:36:11,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.6976', 'grad_norm': '0.2169', 'learning_rate': '0.0001999', 'ppl': '2.009', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 778240, 'tokens/trainable': 774637, 'epoch': '0.01673'}
  2%|███▎                                                                                                                                                                                               | 95/5680 [14:40<12:36:11,  8.12s/it]  2%|███▎                                                                                                                                                                                               | 96/5680 [14:49<12:36:55,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.8412', 'grad_norm': '0.2459', 'learning_rate': '0.0001999', 'ppl': '2.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.5', 'tokens/total': 786432, 'tokens/trainable': 782728, 'epoch': '0.0169'}
  2%|███▎                                                                                                                                                                                               | 96/5680 [14:49<12:36:55,  8.13s/it]  2%|███▎                                                                                                                                                                                               | 97/5680 [14:57<12:38:34,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.9901', 'grad_norm': '0.2525', 'learning_rate': '0.0001999', 'ppl': '2.692', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.5', 'tokens/total': 794624, 'tokens/trainable': 790899, 'epoch': '0.01708'}
  2%|███▎                                                                                                                                                                                               | 97/5680 [14:57<12:38:34,  8.15s/it]  2%|███▎                                                                                                                                                                                               | 98/5680 [15:05<12:39:11,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.7799', 'grad_norm': '0.2049', 'learning_rate': '0.0001999', 'ppl': '2.181', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 802816, 'tokens/trainable': 799083, 'epoch': '0.01725'}
  2%|███▎                                                                                                                                                                                               | 98/5680 [15:05<12:39:11,  8.16s/it]  2%|███▍                                                                                                                                                                                               | 99/5680 [15:13<12:35:33,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '1.04', 'grad_norm': '0.249', 'learning_rate': '0.0001999', 'ppl': '2.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 811008, 'tokens/trainable': 807261, 'epoch': '0.01743'}
  2%|███▍                                                                                                                                                                                               | 99/5680 [15:13<12:35:33,  8.12s/it]  2%|███▍                                                                                                                                                                                              | 100/5680 [15:21<12:31:36,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.9458', 'grad_norm': '0.2413', 'learning_rate': '0.0001999', 'ppl': '2.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 819200, 'tokens/trainable': 815433, 'epoch': '0.01761'}
  2%|███▍                                                                                                                                                                                              | 100/5680 [15:21<12:31:36,  8.08s/it]  2%|███▍                                                                                                                                                                                              | 101/5680 [15:29<12:28:42,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5096', 'grad_norm': '0.1702', 'learning_rate': '0.0001998', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 827392, 'tokens/trainable': 823611, 'epoch': '0.01778'}
  2%|███▍                                                                                                                                                                                              | 101/5680 [15:29<12:28:42,  8.05s/it]  2%|███▍                                                                                                                                                                                              | 102/5680 [15:37<12:27:03,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7501', 'grad_norm': '0.203', 'learning_rate': '0.0001998', 'ppl': '2.117', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 835584, 'tokens/trainable': 831776, 'epoch': '0.01796'}
  2%|███▍                                                                                                                                                                                              | 102/5680 [15:37<12:27:03,  8.04s/it]  2%|███▌                                                                                                                                                                                              | 103/5680 [15:45<12:24:57,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.658', 'grad_norm': '0.2126', 'learning_rate': '0.0001998', 'ppl': '1.931', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 843776, 'tokens/trainable': 839907, 'epoch': '0.01813'}
  2%|███▌                                                                                                                                                                                              | 103/5680 [15:45<12:24:57,  8.01s/it]  2%|███▌                                                                                                                                                                                              | 104/5680 [15:53<12:22:10,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8446', 'grad_norm': '0.224', 'learning_rate': '0.0001998', 'ppl': '2.327', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 851968, 'tokens/trainable': 848004, 'epoch': '0.01831'}
  2%|███▌                                                                                                                                                                                              | 104/5680 [15:53<12:22:10,  7.99s/it]  2%|███▌                                                                                                                                                                                              | 105/5680 [16:01<12:21:54,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.9831', 'grad_norm': '0.264', 'learning_rate': '0.0001998', 'ppl': '2.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 860160, 'tokens/trainable': 856133, 'epoch': '0.01849'}
  2%|███▌                                                                                                                                                                                              | 105/5680 [16:01<12:21:54,  7.98s/it]  2%|███▌                                                                                                                                                                                              | 106/5680 [16:09<12:21:37,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8626', 'grad_norm': '0.2265', 'learning_rate': '0.0001998', 'ppl': '2.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 868352, 'tokens/trainable': 864282, 'epoch': '0.01866'}
  2%|███▌                                                                                                                                                                                              | 106/5680 [16:09<12:21:37,  7.98s/it]  2%|███▋                                                                                                                                                                                              | 107/5680 [16:17<12:20:28,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.9345', 'grad_norm': '0.2334', 'learning_rate': '0.0001998', 'ppl': '2.546', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 876544, 'tokens/trainable': 872450, 'epoch': '0.01884'}
  2%|███▋                                                                                                                                                                                              | 107/5680 [16:17<12:20:28,  7.97s/it]  2%|███▋                                                                                                                                                                                              | 108/5680 [16:25<12:20:32,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7926', 'grad_norm': '0.24', 'learning_rate': '0.0001998', 'ppl': '2.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 884736, 'tokens/trainable': 880635, 'epoch': '0.01901'}
  2%|███▋                                                                                                                                                                                              | 108/5680 [16:25<12:20:32,  7.97s/it]  2%|███▋                                                                                                                                                                                              | 109/5680 [16:33<12:21:22,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6997', 'grad_norm': '0.2176', 'learning_rate': '0.0001998', 'ppl': '2.013', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 892928, 'tokens/trainable': 888809, 'epoch': '0.01919'}
  2%|███▋                                                                                                                                                                                              | 109/5680 [16:33<12:21:22,  7.98s/it]  2%|███▊                                                                                                                                                                                              | 110/5680 [16:41<12:21:33,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6714', 'grad_norm': '0.2255', 'learning_rate': '0.0001998', 'ppl': '1.957', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 901120, 'tokens/trainable': 896922, 'epoch': '0.01937'}
  2%|███▊                                                                                                                                                                                              | 110/5680 [16:41<12:21:33,  7.99s/it]  2%|███▊                                                                                                                                                                                              | 111/5680 [16:49<12:22:20,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.7493', 'grad_norm': '0.2024', 'learning_rate': '0.0001998', 'ppl': '2.115', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 909312, 'tokens/trainable': 905019, 'epoch': '0.01954'}
  2%|███▊                                                                                                                                                                                              | 111/5680 [16:49<12:22:20,  8.00s/it]  2%|███▊                                                                                                                                                                                              | 112/5680 [16:57<12:24:27,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6804', 'grad_norm': '0.1958', 'learning_rate': '0.0001998', 'ppl': '1.975', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 917504, 'tokens/trainable': 913134, 'epoch': '0.01972'}
  2%|███▊                                                                                                                                                                                              | 112/5680 [16:57<12:24:27,  8.02s/it]  2%|███▊                                                                                                                                                                                              | 113/5680 [17:05<12:27:30,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.5997', 'grad_norm': '0.2105', 'learning_rate': '0.0001998', 'ppl': '1.821', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.1', 'tokens/total': 925696, 'tokens/trainable': 921261, 'epoch': '0.01989'}
  2%|███▊                                                                                                                                                                                              | 113/5680 [17:05<12:27:30,  8.06s/it]  2%|███▉                                                                                                                                                                                              | 114/5680 [17:13<12:27:27,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7381', 'grad_norm': '0.2072', 'learning_rate': '0.0001998', 'ppl': '2.092', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 933888, 'tokens/trainable': 929372, 'epoch': '0.02007'}
  2%|███▉                                                                                                                                                                                              | 114/5680 [17:13<12:27:27,  8.06s/it]  2%|███▉                                                                                                                                                                                              | 115/5680 [17:21<12:26:22,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.7426', 'grad_norm': '0.2009', 'learning_rate': '0.0001998', 'ppl': '2.101', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 942080, 'tokens/trainable': 937497, 'epoch': '0.02025'}
  2%|███▉                                                                                                                                                                                              | 115/5680 [17:21<12:26:22,  8.05s/it]  2%|███▉                                                                                                                                                                                              | 116/5680 [17:29<12:24:53,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.7696', 'grad_norm': '0.2134', 'learning_rate': '0.0001998', 'ppl': '2.159', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 950272, 'tokens/trainable': 945622, 'epoch': '0.02042'}
  2%|███▉                                                                                                                                                                                              | 116/5680 [17:29<12:24:53,  8.03s/it]  2%|███▉                                                                                                                                                                                              | 117/5680 [17:37<12:23:09,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.7223', 'grad_norm': '0.2037', 'learning_rate': '0.0001998', 'ppl': '2.059', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 958464, 'tokens/trainable': 953742, 'epoch': '0.0206'}
  2%|███▉                                                                                                                                                                                              | 117/5680 [17:37<12:23:09,  8.02s/it]  2%|████                                                                                                                                                                                              | 118/5680 [17:45<12:22:03,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6736', 'grad_norm': '0.2378', 'learning_rate': '0.0001998', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 966656, 'tokens/trainable': 961902, 'epoch': '0.02077'}
  2%|████                                                                                                                                                                                              | 118/5680 [17:45<12:22:03,  8.01s/it]  2%|████                                                                                                                                                                                              | 119/5680 [17:53<12:19:39,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7432', 'grad_norm': '0.2207', 'learning_rate': '0.0001998', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 974848, 'tokens/trainable': 970033, 'epoch': '0.02095'}
  2%|████                                                                                                                                                                                              | 119/5680 [17:53<12:19:39,  7.98s/it]  2%|████                                                                                                                                                                                              | 120/5680 [18:01<12:19:08,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5128', 'grad_norm': '0.181', 'learning_rate': '0.0001998', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 983040, 'tokens/trainable': 978197, 'epoch': '0.02113'}
  2%|████                                                                                                                                                                                              | 120/5680 [18:01<12:19:08,  7.98s/it]  2%|████▏                                                                                                                                                                                             | 121/5680 [18:09<12:19:34,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8631', 'grad_norm': '0.2394', 'learning_rate': '0.0001998', 'ppl': '2.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 991232, 'tokens/trainable': 986328, 'epoch': '0.0213'}
  2%|████▏                                                                                                                                                                                             | 121/5680 [18:09<12:19:34,  7.98s/it]  2%|████▏                                                                                                                                                                                             | 122/5680 [18:17<12:19:30,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5833', 'grad_norm': '0.1992', 'learning_rate': '0.0001998', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 999424, 'tokens/trainable': 994462, 'epoch': '0.02148'}
  2%|████▏                                                                                                                                                                                             | 122/5680 [18:17<12:19:30,  7.98s/it]  2%|████▏                                                                                                                                                                                             | 123/5680 [18:25<12:19:40,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8893', 'grad_norm': '0.2637', 'learning_rate': '0.0001998', 'ppl': '2.433', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 1007616, 'tokens/trainable': 1002606, 'epoch': '0.02165'}
  2%|████▏                                                                                                                                                                                             | 123/5680 [18:25<12:19:40,  7.99s/it]  2%|████▏                                                                                                                                                                                             | 124/5680 [18:33<12:19:14,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8257', 'grad_norm': '0.2362', 'learning_rate': '0.0001998', 'ppl': '2.283', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 1015808, 'tokens/trainable': 1010755, 'epoch': '0.02183'}
  2%|████▏                                                                                                                                                                                             | 124/5680 [18:33<12:19:14,  7.98s/it]  2%|████▎                                                                                                                                                                                             | 125/5680 [18:41<12:19:40,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8601', 'grad_norm': '0.247', 'learning_rate': '0.0001998', 'ppl': '2.364', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 1024000, 'tokens/trainable': 1018927, 'epoch': '0.02201'}
  2%|████▎                                                                                                                                                                                             | 125/5680 [18:41<12:19:40,  7.99s/it]  2%|████▎                                                                                                                                                                                             | 126/5680 [18:49<12:18:41,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6673', 'grad_norm': '0.2202', 'learning_rate': '0.0001998', 'ppl': '1.949', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 1032192, 'tokens/trainable': 1027055, 'epoch': '0.02218'}
  2%|████▎                                                                                                                                                                                             | 126/5680 [18:49<12:18:41,  7.98s/it]  2%|████▎                                                                                                                                                                                             | 127/5680 [18:57<12:18:44,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7545', 'grad_norm': '0.2225', 'learning_rate': '0.0001998', 'ppl': '2.127', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 1040384, 'tokens/trainable': 1035152, 'epoch': '0.02236'}
  2%|████▎                                                                                                                                                                                             | 127/5680 [18:57<12:18:44,  7.98s/it]  2%|████▎                                                                                                                                                                                             | 128/5680 [19:05<12:18:29,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8937', 'grad_norm': '0.2395', 'learning_rate': '0.0001998', 'ppl': '2.444', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 1048576, 'tokens/trainable': 1043318, 'epoch': '0.02254'}
  2%|████▎                                                                                                                                                                                             | 128/5680 [19:05<12:18:29,  7.98s/it]  2%|████▍                                                                                                                                                                                             | 129/5680 [19:13<12:19:45,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.9113', 'grad_norm': '0.2426', 'learning_rate': '0.0001997', 'ppl': '2.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 1056768, 'tokens/trainable': 1051482, 'epoch': '0.02271'}
  2%|████▍                                                                                                                                                                                             | 129/5680 [19:13<12:19:45,  8.00s/it]  2%|████▍                                                                                                                                                                                             | 130/5680 [19:21<12:19:28,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.9364', 'grad_norm': '0.2562', 'learning_rate': '0.0001997', 'ppl': '2.551', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 1064960, 'tokens/trainable': 1059631, 'epoch': '0.02289'}
  2%|████▍                                                                                                                                                                                             | 130/5680 [19:21<12:19:28,  7.99s/it]  2%|████▍                                                                                                                                                                                             | 131/5680 [19:29<12:18:47,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8103', 'grad_norm': '0.2169', 'learning_rate': '0.0001997', 'ppl': '2.248', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 1073152, 'tokens/trainable': 1067773, 'epoch': '0.02306'}
  2%|████▍                                                                                                                                                                                             | 131/5680 [19:29<12:18:47,  7.99s/it]  2%|████▌                                                                                                                                                                                             | 132/5680 [19:37<12:18:01,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7755', 'grad_norm': '0.2331', 'learning_rate': '0.0001997', 'ppl': '2.172', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 1081344, 'tokens/trainable': 1075923, 'epoch': '0.02324'}
  2%|████▌                                                                                                                                                                                             | 132/5680 [19:37<12:18:01,  7.98s/it]  2%|████▌                                                                                                                                                                                             | 133/5680 [19:45<12:17:58,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.9575', 'grad_norm': '0.2602', 'learning_rate': '0.0001997', 'ppl': '2.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 1089536, 'tokens/trainable': 1084075, 'epoch': '0.02342'}
  2%|████▌                                                                                                                                                                                             | 133/5680 [19:45<12:17:58,  7.98s/it]  2%|████▌                                                                                                                                                                                             | 134/5680 [19:53<12:18:04,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7822', 'grad_norm': '0.2229', 'learning_rate': '0.0001997', 'ppl': '2.186', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 1097728, 'tokens/trainable': 1092258, 'epoch': '0.02359'}
  2%|████▌                                                                                                                                                                                             | 134/5680 [19:53<12:18:04,  7.99s/it]  2%|████▌                                                                                                                                                                                             | 135/5680 [20:01<12:17:45,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.9573', 'grad_norm': '0.2351', 'learning_rate': '0.0001997', 'ppl': '2.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 1105920, 'tokens/trainable': 1100438, 'epoch': '0.02377'}
  2%|████▌                                                                                                                                                                                             | 135/5680 [20:01<12:17:45,  7.98s/it]  2%|████▋                                                                                                                                                                                             | 136/5680 [20:09<12:16:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '1.036', 'grad_norm': '0.2339', 'learning_rate': '0.0001997', 'ppl': '2.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 1114112, 'tokens/trainable': 1108600, 'epoch': '0.02394'}
  2%|████▋                                                                                                                                                                                             | 136/5680 [20:09<12:16:45,  7.97s/it]  2%|████▋                                                                                                                                                                                             | 137/5680 [20:17<12:16:56,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6108', 'grad_norm': '0.1828', 'learning_rate': '0.0001997', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 1122304, 'tokens/trainable': 1116758, 'epoch': '0.02412'}
  2%|████▋                                                                                                                                                                                             | 137/5680 [20:17<12:16:56,  7.98s/it]  2%|████▋                                                                                                                                                                                             | 138/5680 [20:25<12:26:11,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8068', 'grad_norm': '0.2058', 'learning_rate': '0.0001997', 'ppl': '2.241', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.4', 'tokens/total': 1130496, 'tokens/trainable': 1124867, 'epoch': '0.0243'}
  2%|████▋                                                                                                                                                                                             | 138/5680 [20:25<12:26:11,  8.08s/it]  2%|████▋                                                                                                                                                                                             | 139/5680 [20:33<12:24:15,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4905', 'grad_norm': '0.181', 'learning_rate': '0.0001997', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 1138688, 'tokens/trainable': 1133048, 'epoch': '0.02447'}
  2%|████▋                                                                                                                                                                                             | 139/5680 [20:33<12:24:15,  8.06s/it]  2%|████▊                                                                                                                                                                                             | 140/5680 [20:41<12:22:53,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '1.011', 'grad_norm': '0.2869', 'learning_rate': '0.0001997', 'ppl': '2.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 1146880, 'tokens/trainable': 1141194, 'epoch': '0.02465'}
  2%|████▊                                                                                                                                                                                             | 140/5680 [20:41<12:22:53,  8.05s/it]  2%|████▊                                                                                                                                                                                             | 141/5680 [20:49<12:21:48,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7084', 'grad_norm': '0.2217', 'learning_rate': '0.0001997', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 1155072, 'tokens/trainable': 1149314, 'epoch': '0.02482'}
  2%|████▊                                                                                                                                                                                             | 141/5680 [20:49<12:21:48,  8.04s/it]  2%|████▊                                                                                                                                                                                             | 142/5680 [20:57<12:19:49,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.9498', 'grad_norm': '0.233', 'learning_rate': '0.0001997', 'ppl': '2.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 1163264, 'tokens/trainable': 1157482, 'epoch': '0.025'}
  2%|████▊                                                                                                                                                                                             | 142/5680 [20:57<12:19:49,  8.02s/it]  3%|████▉                                                                                                                                                                                             | 143/5680 [21:05<12:18:59,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5456', 'grad_norm': '0.1994', 'learning_rate': '0.0001997', 'ppl': '1.726', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 1171456, 'tokens/trainable': 1165620, 'epoch': '0.02518'}
  3%|████▉                                                                                                                                                                                             | 143/5680 [21:05<12:18:59,  8.01s/it]  3%|████▉                                                                                                                                                                                             | 144/5680 [21:13<12:17:24,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7322', 'grad_norm': '0.2289', 'learning_rate': '0.0001997', 'ppl': '2.08', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 1179648, 'tokens/trainable': 1173773, 'epoch': '0.02535'}
  3%|████▉                                                                                                                                                                                             | 144/5680 [21:13<12:17:24,  7.99s/it]  3%|████▉                                                                                                                                                                                             | 145/5680 [21:21<12:18:40,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.9603', 'grad_norm': '0.2347', 'learning_rate': '0.0001997', 'ppl': '2.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 1187840, 'tokens/trainable': 1181910, 'epoch': '0.02553'}
  3%|████▉                                                                                                                                                                                             | 145/5680 [21:21<12:18:40,  8.01s/it]  3%|████▉                                                                                                                                                                                             | 146/5680 [21:30<12:45:08,  8.30s/it]                                                                                                                                                                                                                                             {'loss': '0.7093', 'grad_norm': '0.2192', 'learning_rate': '0.0001997', 'ppl': '2.033', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '903.2', 'tokens/total': 1196032, 'tokens/trainable': 1190009, 'epoch': '0.0257'}
  3%|████▉                                                                                                                                                                                             | 146/5680 [21:30<12:45:08,  8.30s/it]  3%|█████                                                                                                                                                                                             | 147/5680 [21:38<12:35:50,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.6469', 'grad_norm': '0.2133', 'learning_rate': '0.0001997', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 1204224, 'tokens/trainable': 1198158, 'epoch': '0.02588'}
  3%|█████                                                                                                                                                                                             | 147/5680 [21:38<12:35:50,  8.20s/it]  3%|█████                                                                                                                                                                                             | 148/5680 [21:46<12:30:29,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.5829', 'grad_norm': '0.2037', 'learning_rate': '0.0001997', 'ppl': '1.791', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 1212416, 'tokens/trainable': 1206288, 'epoch': '0.02606'}
  3%|█████                                                                                                                                                                                             | 148/5680 [21:46<12:30:29,  8.14s/it]  3%|█████                                                                                                                                                                                             | 149/5680 [21:54<12:26:12,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.8303', 'grad_norm': '0.2249', 'learning_rate': '0.0001997', 'ppl': '2.294', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 1220608, 'tokens/trainable': 1214443, 'epoch': '0.02623'}
  3%|█████                                                                                                                                                                                             | 149/5680 [21:54<12:26:12,  8.09s/it]  3%|█████                                                                                                                                                                                             | 150/5680 [22:02<12:22:52,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7816', 'grad_norm': '0.251', 'learning_rate': '0.0001997', 'ppl': '2.185', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 1228800, 'tokens/trainable': 1222555, 'epoch': '0.02641'}
  3%|█████                                                                                                                                                                                             | 150/5680 [22:02<12:22:52,  8.06s/it]  3%|█████▏                                                                                                                                                                                            | 151/5680 [22:10<12:20:10,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.6204', 'grad_norm': '0.2173', 'learning_rate': '0.0001997', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 1236992, 'tokens/trainable': 1230725, 'epoch': '0.02658'}
  3%|█████▏                                                                                                                                                                                            | 151/5680 [22:10<12:20:10,  8.03s/it]  3%|█████▏                                                                                                                                                                                            | 152/5680 [22:18<12:19:15,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.8902', 'grad_norm': '0.2389', 'learning_rate': '0.0001997', 'ppl': '2.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 1245184, 'tokens/trainable': 1238885, 'epoch': '0.02676'}
  3%|█████▏                                                                                                                                                                                            | 152/5680 [22:18<12:19:15,  8.02s/it]  3%|█████▏                                                                                                                                                                                            | 153/5680 [22:26<12:18:44,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.9366', 'grad_norm': '0.2332', 'learning_rate': '0.0001996', 'ppl': '2.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 1253376, 'tokens/trainable': 1247000, 'epoch': '0.02694'}
  3%|█████▏                                                                                                                                                                                            | 153/5680 [22:26<12:18:44,  8.02s/it]  3%|█████▎                                                                                                                                                                                            | 154/5680 [22:34<12:29:22,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8095', 'grad_norm': '0.2356', 'learning_rate': '0.0001996', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971', 'tokens/total': 1261568, 'tokens/trainable': 1255162, 'epoch': '0.02711'}
  3%|█████▎                                                                                                                                                                                            | 154/5680 [22:34<12:29:22,  8.14s/it]  3%|█████▎                                                                                                                                                                                            | 155/5680 [22:42<12:24:42,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.6129', 'grad_norm': '0.2192', 'learning_rate': '0.0001996', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 1269760, 'tokens/trainable': 1263342, 'epoch': '0.02729'}
  3%|█████▎                                                                                                                                                                                            | 155/5680 [22:42<12:24:42,  8.09s/it]  3%|█████▎                                                                                                                                                                                            | 156/5680 [22:50<12:22:05,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9166', 'grad_norm': '0.2518', 'learning_rate': '0.0001996', 'ppl': '2.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 1277952, 'tokens/trainable': 1271521, 'epoch': '0.02746'}
  3%|█████▎                                                                                                                                                                                            | 156/5680 [22:50<12:22:05,  8.06s/it]  3%|█████▎                                                                                                                                                                                            | 157/5680 [22:58<12:19:53,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7266', 'grad_norm': '0.2183', 'learning_rate': '0.0001996', 'ppl': '2.068', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 1286144, 'tokens/trainable': 1279705, 'epoch': '0.02764'}
  3%|█████▎                                                                                                                                                                                            | 157/5680 [22:58<12:19:53,  8.04s/it]  3%|█████▍                                                                                                                                                                                            | 158/5680 [23:06<12:18:36,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.649', 'grad_norm': '0.2089', 'learning_rate': '0.0001996', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 1294336, 'tokens/trainable': 1287860, 'epoch': '0.02782'}
  3%|█████▍                                                                                                                                                                                            | 158/5680 [23:06<12:18:36,  8.03s/it]  3%|█████▍                                                                                                                                                                                            | 159/5680 [23:14<12:15:51,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '1.068', 'grad_norm': '0.251', 'learning_rate': '0.0001996', 'ppl': '2.911', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 1302528, 'tokens/trainable': 1296001, 'epoch': '0.02799'}
  3%|█████▍                                                                                                                                                                                            | 159/5680 [23:14<12:15:51,  8.00s/it]  3%|█████▍                                                                                                                                                                                            | 160/5680 [23:22<12:15:58,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6519', 'grad_norm': '0.2408', 'learning_rate': '0.0001996', 'ppl': '1.919', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 1310720, 'tokens/trainable': 1304080, 'epoch': '0.02817'}
  3%|█████▍                                                                                                                                                                                            | 160/5680 [23:22<12:15:58,  8.00s/it]  3%|█████▍                                                                                                                                                                                            | 161/5680 [23:30<12:16:14,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.8827', 'grad_norm': '0.2431', 'learning_rate': '0.0001996', 'ppl': '2.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 1318912, 'tokens/trainable': 1312220, 'epoch': '0.02835'}
  3%|█████▍                                                                                                                                                                                            | 161/5680 [23:30<12:16:14,  8.00s/it]  3%|█████▌                                                                                                                                                                                            | 162/5680 [23:39<12:31:39,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.9581', 'grad_norm': '0.2336', 'learning_rate': '0.0001996', 'ppl': '2.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '953.7', 'tokens/total': 1327104, 'tokens/trainable': 1320389, 'epoch': '0.02852'}
  3%|█████▌                                                                                                                                                                                            | 162/5680 [23:39<12:31:39,  8.17s/it]  3%|█████▌                                                                                                                                                                                            | 163/5680 [23:47<12:25:16,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.7079', 'grad_norm': '0.2103', 'learning_rate': '0.0001996', 'ppl': '2.03', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 1335296, 'tokens/trainable': 1328506, 'epoch': '0.0287'}
  3%|█████▌                                                                                                                                                                                            | 163/5680 [23:47<12:25:16,  8.11s/it]  3%|█████▌                                                                                                                                                                                            | 164/5680 [23:55<12:22:45,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.6474', 'grad_norm': '0.2916', 'learning_rate': '0.0001996', 'ppl': '1.911', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 1343488, 'tokens/trainable': 1336678, 'epoch': '0.02887'}
  3%|█████▌                                                                                                                                                                                            | 164/5680 [23:55<12:22:45,  8.08s/it]  3%|█████▋                                                                                                                                                                                            | 165/5680 [24:03<12:19:55,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.9239', 'grad_norm': '0.2391', 'learning_rate': '0.0001996', 'ppl': '2.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 1351680, 'tokens/trainable': 1344859, 'epoch': '0.02905'}
  3%|█████▋                                                                                                                                                                                            | 165/5680 [24:03<12:19:55,  8.05s/it]  3%|█████▋                                                                                                                                                                                            | 166/5680 [24:11<12:19:13,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.6', 'grad_norm': '0.2266', 'learning_rate': '0.0001996', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 1359872, 'tokens/trainable': 1352969, 'epoch': '0.02923'}
  3%|█████▋                                                                                                                                                                                            | 166/5680 [24:11<12:19:13,  8.04s/it]  3%|█████▋                                                                                                                                                                                            | 167/5680 [24:19<12:17:13,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6799', 'grad_norm': '0.2245', 'learning_rate': '0.0001996', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 1368064, 'tokens/trainable': 1361158, 'epoch': '0.0294'}
  3%|█████▋                                                                                                                                                                                            | 167/5680 [24:19<12:17:13,  8.02s/it]  3%|█████▋                                                                                                                                                                                            | 168/5680 [24:27<12:16:41,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6615', 'grad_norm': '0.2305', 'learning_rate': '0.0001996', 'ppl': '1.938', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 1376256, 'tokens/trainable': 1369302, 'epoch': '0.02958'}
  3%|█████▋                                                                                                                                                                                            | 168/5680 [24:27<12:16:41,  8.02s/it]  3%|█████▊                                                                                                                                                                                            | 169/5680 [24:35<12:17:21,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.9992', 'grad_norm': '0.2492', 'learning_rate': '0.0001996', 'ppl': '2.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 1384448, 'tokens/trainable': 1377414, 'epoch': '0.02975'}
  3%|█████▊                                                                                                                                                                                            | 169/5680 [24:35<12:17:21,  8.03s/it]  3%|█████▊                                                                                                                                                                                            | 170/5680 [24:44<12:38:23,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '0.7226', 'grad_norm': '0.2171', 'learning_rate': '0.0001996', 'ppl': '2.06', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '922.8', 'tokens/total': 1392640, 'tokens/trainable': 1385529, 'epoch': '0.02993'}
  3%|█████▊                                                                                                                                                                                            | 170/5680 [24:44<12:38:23,  8.26s/it]  3%|█████▊                                                                                                                                                                                            | 171/5680 [24:52<12:36:01,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.7497', 'grad_norm': '0.2516', 'learning_rate': '0.0001996', 'ppl': '2.116', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 1400832, 'tokens/trainable': 1393721, 'epoch': '0.03011'}
  3%|█████▊                                                                                                                                                                                            | 171/5680 [24:52<12:36:01,  8.23s/it]  3%|█████▊                                                                                                                                                                                            | 172/5680 [25:00<12:34:20,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.7276', 'grad_norm': '0.2059', 'learning_rate': '0.0001996', 'ppl': '2.07', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 1409024, 'tokens/trainable': 1401907, 'epoch': '0.03028'}
  3%|█████▊                                                                                                                                                                                            | 172/5680 [25:00<12:34:20,  8.22s/it]  3%|█████▉                                                                                                                                                                                            | 173/5680 [25:08<12:32:27,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.9348', 'grad_norm': '0.2336', 'learning_rate': '0.0001995', 'ppl': '2.547', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 1417216, 'tokens/trainable': 1410092, 'epoch': '0.03046'}
  3%|█████▉                                                                                                                                                                                            | 173/5680 [25:08<12:32:27,  8.20s/it]  3%|█████▉                                                                                                                                                                                            | 174/5680 [25:16<12:30:15,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.8242', 'grad_norm': '0.2166', 'learning_rate': '0.0001995', 'ppl': '2.28', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 1425408, 'tokens/trainable': 1418275, 'epoch': '0.03063'}
  3%|█████▉                                                                                                                                                                                            | 174/5680 [25:16<12:30:15,  8.18s/it]  3%|█████▉                                                                                                                                                                                            | 175/5680 [25:24<12:31:17,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.7283', 'grad_norm': '0.2207', 'learning_rate': '0.0001995', 'ppl': '2.072', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 1433600, 'tokens/trainable': 1426432, 'epoch': '0.03081'}
  3%|█████▉                                                                                                                                                                                            | 175/5680 [25:24<12:31:17,  8.19s/it]  3%|██████                                                                                                                                                                                            | 176/5680 [25:33<12:31:30,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '1.013', 'grad_norm': '0.2385', 'learning_rate': '0.0001995', 'ppl': '2.755', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.3', 'tokens/total': 1441792, 'tokens/trainable': 1434601, 'epoch': '0.03099'}
  3%|██████                                                                                                                                                                                            | 176/5680 [25:33<12:31:30,  8.19s/it]  3%|██████                                                                                                                                                                                            | 177/5680 [25:41<12:31:18,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.92', 'grad_norm': '0.2544', 'learning_rate': '0.0001995', 'ppl': '2.509', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.2', 'tokens/total': 1449984, 'tokens/trainable': 1442749, 'epoch': '0.03116'}
  3%|██████                                                                                                                                                                                            | 177/5680 [25:41<12:31:18,  8.19s/it]  3%|██████                                                                                                                                                                                            | 178/5680 [25:49<12:35:32,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.7072', 'grad_norm': '0.2266', 'learning_rate': '0.0001995', 'ppl': '2.028', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.7', 'tokens/total': 1458176, 'tokens/trainable': 1450911, 'epoch': '0.03134'}
  3%|██████                                                                                                                                                                                            | 178/5680 [25:49<12:35:32,  8.24s/it]  3%|██████                                                                                                                                                                                            | 179/5680 [25:57<12:32:53,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.816', 'grad_norm': '0.2612', 'learning_rate': '0.0001995', 'ppl': '2.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.8', 'tokens/total': 1466368, 'tokens/trainable': 1459038, 'epoch': '0.03151'}
  3%|██████                                                                                                                                                                                            | 179/5680 [25:57<12:32:53,  8.21s/it]  3%|██████▏                                                                                                                                                                                           | 180/5680 [26:05<12:28:40,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8445', 'grad_norm': '0.2339', 'learning_rate': '0.0001995', 'ppl': '2.327', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 1474560, 'tokens/trainable': 1467207, 'epoch': '0.03169'}
  3%|██████▏                                                                                                                                                                                           | 180/5680 [26:05<12:28:40,  8.17s/it]  3%|██████▏                                                                                                                                                                                           | 181/5680 [26:13<12:27:15,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.7304', 'grad_norm': '0.2296', 'learning_rate': '0.0001995', 'ppl': '2.076', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 1482752, 'tokens/trainable': 1475370, 'epoch': '0.03187'}
  3%|██████▏                                                                                                                                                                                           | 181/5680 [26:13<12:27:15,  8.15s/it]  3%|██████▏                                                                                                                                                                                           | 182/5680 [26:22<12:26:46,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6016', 'grad_norm': '0.1928', 'learning_rate': '0.0001995', 'ppl': '1.825', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 1490944, 'tokens/trainable': 1483534, 'epoch': '0.03204'}
  3%|██████▏                                                                                                                                                                                           | 182/5680 [26:22<12:26:46,  8.15s/it]  3%|██████▎                                                                                                                                                                                           | 183/5680 [26:30<12:29:07,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.841', 'grad_norm': '0.2954', 'learning_rate': '0.0001995', 'ppl': '2.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.3', 'tokens/total': 1499136, 'tokens/trainable': 1491667, 'epoch': '0.03222'}
  3%|██████▎                                                                                                                                                                                           | 183/5680 [26:30<12:29:07,  8.18s/it]  3%|██████▎                                                                                                                                                                                           | 184/5680 [26:38<12:31:07,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.6826', 'grad_norm': '0.2179', 'learning_rate': '0.0001995', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.2', 'tokens/total': 1507328, 'tokens/trainable': 1499804, 'epoch': '0.03239'}
  3%|██████▎                                                                                                                                                                                           | 184/5680 [26:38<12:31:07,  8.20s/it]  3%|██████▎                                                                                                                                                                                           | 185/5680 [26:46<12:26:52,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.6078', 'grad_norm': '0.2628', 'learning_rate': '0.0001995', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 1515520, 'tokens/trainable': 1507922, 'epoch': '0.03257'}
  3%|██████▎                                                                                                                                                                                           | 185/5680 [26:46<12:26:52,  8.16s/it]  3%|██████▎                                                                                                                                                                                           | 186/5680 [26:54<12:24:59,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.6841', 'grad_norm': '0.2021', 'learning_rate': '0.0001995', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 1523712, 'tokens/trainable': 1516071, 'epoch': '0.03275'}
  3%|██████▎                                                                                                                                                                                           | 186/5680 [26:54<12:24:59,  8.14s/it]  3%|██████▍                                                                                                                                                                                           | 187/5680 [27:02<12:25:14,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.873', 'grad_norm': '0.2392', 'learning_rate': '0.0001995', 'ppl': '2.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 1531904, 'tokens/trainable': 1524227, 'epoch': '0.03292'}
  3%|██████▍                                                                                                                                                                                           | 187/5680 [27:02<12:25:14,  8.14s/it]  3%|██████▍                                                                                                                                                                                           | 188/5680 [27:11<12:26:03,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '1.075', 'grad_norm': '0.2459', 'learning_rate': '0.0001995', 'ppl': '2.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 1540096, 'tokens/trainable': 1532406, 'epoch': '0.0331'}
  3%|██████▍                                                                                                                                                                                           | 188/5680 [27:11<12:26:03,  8.15s/it]  3%|██████▍                                                                                                                                                                                           | 189/5680 [27:19<12:26:27,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.9336', 'grad_norm': '0.2257', 'learning_rate': '0.0001995', 'ppl': '2.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.5', 'tokens/total': 1548288, 'tokens/trainable': 1540550, 'epoch': '0.03327'}
  3%|██████▍                                                                                                                                                                                           | 189/5680 [27:19<12:26:27,  8.16s/it]  3%|██████▍                                                                                                                                                                                           | 190/5680 [27:27<12:26:36,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '1.007', 'grad_norm': '0.2413', 'learning_rate': '0.0001995', 'ppl': '2.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.5', 'tokens/total': 1556480, 'tokens/trainable': 1548686, 'epoch': '0.03345'}
  3%|██████▍                                                                                                                                                                                           | 190/5680 [27:27<12:26:36,  8.16s/it]  3%|██████▌                                                                                                                                                                                           | 191/5680 [27:35<12:25:57,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.9459', 'grad_norm': '0.2738', 'learning_rate': '0.0001994', 'ppl': '2.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 1564672, 'tokens/trainable': 1556839, 'epoch': '0.03363'}
  3%|██████▌                                                                                                                                                                                           | 191/5680 [27:35<12:25:57,  8.15s/it]  3%|██████▌                                                                                                                                                                                           | 192/5680 [27:43<12:30:19,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.581', 'grad_norm': '0.188', 'learning_rate': '0.0001994', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.2', 'tokens/total': 1572864, 'tokens/trainable': 1564997, 'epoch': '0.0338'}
  3%|██████▌                                                                                                                                                                                           | 192/5680 [27:43<12:30:19,  8.20s/it]  3%|██████▌                                                                                                                                                                                           | 193/5680 [27:52<12:32:35,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.8579', 'grad_norm': '0.2362', 'learning_rate': '0.0001994', 'ppl': '2.358', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.8', 'tokens/total': 1581056, 'tokens/trainable': 1573183, 'epoch': '0.03398'}
  3%|██████▌                                                                                                                                                                                           | 193/5680 [27:52<12:32:35,  8.23s/it]  3%|██████▋                                                                                                                                                                                           | 194/5680 [28:00<12:26:51,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8187', 'grad_norm': '0.2075', 'learning_rate': '0.0001994', 'ppl': '2.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 1589248, 'tokens/trainable': 1581352, 'epoch': '0.03415'}
  3%|██████▋                                                                                                                                                                                           | 194/5680 [28:00<12:26:51,  8.17s/it]  3%|██████▋                                                                                                                                                                                           | 195/5680 [28:08<12:22:50,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6314', 'grad_norm': '0.1985', 'learning_rate': '0.0001994', 'ppl': '1.88', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 1597440, 'tokens/trainable': 1589527, 'epoch': '0.03433'}
  3%|██████▋                                                                                                                                                                                           | 195/5680 [28:08<12:22:50,  8.13s/it]  3%|██████▋                                                                                                                                                                                           | 196/5680 [28:16<12:25:12,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.8', 'grad_norm': '0.236', 'learning_rate': '0.0001994', 'ppl': '2.226', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.2', 'tokens/total': 1605632, 'tokens/trainable': 1597694, 'epoch': '0.03451'}
  3%|██████▋                                                                                                                                                                                           | 196/5680 [28:16<12:25:12,  8.15s/it]  3%|██████▋                                                                                                                                                                                           | 197/5680 [28:24<12:24:09,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7481', 'grad_norm': '0.2037', 'learning_rate': '0.0001994', 'ppl': '2.113', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 1613824, 'tokens/trainable': 1605841, 'epoch': '0.03468'}
  3%|██████▋                                                                                                                                                                                           | 197/5680 [28:24<12:24:09,  8.14s/it]  3%|██████▊                                                                                                                                                                                           | 198/5680 [28:32<12:24:40,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.8108', 'grad_norm': '0.2251', 'learning_rate': '0.0001994', 'ppl': '2.25', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997', 'tokens/total': 1622016, 'tokens/trainable': 1613981, 'epoch': '0.03486'}
  3%|██████▊                                                                                                                                                                                           | 198/5680 [28:32<12:24:40,  8.15s/it]  4%|██████▊                                                                                                                                                                                           | 199/5680 [28:40<12:24:05,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.7389', 'grad_norm': '0.2095', 'learning_rate': '0.0001994', 'ppl': '2.094', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.3', 'tokens/total': 1630208, 'tokens/trainable': 1622099, 'epoch': '0.03504'}
  4%|██████▊                                                                                                                                                                                           | 199/5680 [28:40<12:24:05,  8.15s/it]  4%|██████▊                                                                                                                                                                                           | 200/5680 [28:49<12:26:51,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '1.026', 'grad_norm': '0.2625', 'learning_rate': '0.0001994', 'ppl': '2.789', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.8', 'tokens/total': 1638400, 'tokens/trainable': 1630271, 'epoch': '0.03521'}
  4%|██████▊                                                                                                                                                                                           | 200/5680 [28:49<12:26:51,  8.18s/it]  4%|██████▊                                                                                                                                                                                           | 201/5680 [28:57<12:26:24,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8518', 'grad_norm': '0.2214', 'learning_rate': '0.0001994', 'ppl': '2.344', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.3', 'tokens/total': 1646592, 'tokens/trainable': 1638429, 'epoch': '0.03539'}
  4%|██████▊                                                                                                                                                                                           | 201/5680 [28:57<12:26:24,  8.17s/it]  4%|██████▉                                                                                                                                                                                           | 202/5680 [29:05<12:24:59,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.928', 'grad_norm': '0.2387', 'learning_rate': '0.0001994', 'ppl': '2.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 1654784, 'tokens/trainable': 1646590, 'epoch': '0.03556'}
  4%|██████▉                                                                                                                                                                                           | 202/5680 [29:05<12:24:59,  8.16s/it]  4%|██████▉                                                                                                                                                                                           | 203/5680 [29:13<12:21:51,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.8252', 'grad_norm': '0.2819', 'learning_rate': '0.0001994', 'ppl': '2.282', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 1662976, 'tokens/trainable': 1654745, 'epoch': '0.03574'}
  4%|██████▉                                                                                                                                                                                           | 203/5680 [29:13<12:21:51,  8.13s/it]  4%|██████▉                                                                                                                                                                                           | 204/5680 [29:21<12:24:41,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.9163', 'grad_norm': '0.2283', 'learning_rate': '0.0001994', 'ppl': '2.5', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.3', 'tokens/total': 1671168, 'tokens/trainable': 1662882, 'epoch': '0.03592'}
  4%|██████▉                                                                                                                                                                                           | 204/5680 [29:21<12:24:41,  8.16s/it]  4%|███████                                                                                                                                                                                           | 205/5680 [29:30<12:34:12,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.8468', 'grad_norm': '0.2094', 'learning_rate': '0.0001994', 'ppl': '2.332', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '951.7', 'tokens/total': 1679360, 'tokens/trainable': 1670981, 'epoch': '0.03609'}
  4%|███████                                                                                                                                                                                           | 205/5680 [29:30<12:34:12,  8.27s/it]  4%|███████                                                                                                                                                                                           | 206/5680 [29:38<12:30:23,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.8149', 'grad_norm': '0.2313', 'learning_rate': '0.0001994', 'ppl': '2.259', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 1687552, 'tokens/trainable': 1679128, 'epoch': '0.03627'}
  4%|███████                                                                                                                                                                                           | 206/5680 [29:38<12:30:23,  8.23s/it]  4%|███████                                                                                                                                                                                           | 207/5680 [29:46<12:25:44,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '1.02', 'grad_norm': '0.2596', 'learning_rate': '0.0001994', 'ppl': '2.774', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 1695744, 'tokens/trainable': 1687221, 'epoch': '0.03644'}
  4%|███████                                                                                                                                                                                           | 207/5680 [29:46<12:25:44,  8.18s/it]  4%|███████                                                                                                                                                                                           | 208/5680 [29:54<12:25:55,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.6959', 'grad_norm': '0.226', 'learning_rate': '0.0001993', 'ppl': '2.006', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.9', 'tokens/total': 1703936, 'tokens/trainable': 1695312, 'epoch': '0.03662'}
  4%|███████                                                                                                                                                                                           | 208/5680 [29:54<12:25:55,  8.18s/it]  4%|███████▏                                                                                                                                                                                          | 209/5680 [30:02<12:23:09,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.806', 'grad_norm': '0.2109', 'learning_rate': '0.0001993', 'ppl': '2.239', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 1712128, 'tokens/trainable': 1703396, 'epoch': '0.0368'}
  4%|███████▏                                                                                                                                                                                          | 209/5680 [30:02<12:23:09,  8.15s/it]  4%|███████▏                                                                                                                                                                                          | 210/5680 [30:10<12:22:02,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.6782', 'grad_norm': '0.205', 'learning_rate': '0.0001993', 'ppl': '1.97', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 1720320, 'tokens/trainable': 1711535, 'epoch': '0.03697'}
  4%|███████▏                                                                                                                                                                                          | 210/5680 [30:10<12:22:02,  8.14s/it]  4%|███████▏                                                                                                                                                                                          | 211/5680 [30:18<12:22:46,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.7067', 'grad_norm': '0.2068', 'learning_rate': '0.0001993', 'ppl': '2.027', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.6', 'tokens/total': 1728512, 'tokens/trainable': 1719660, 'epoch': '0.03715'}
  4%|███████▏                                                                                                                                                                                          | 211/5680 [30:18<12:22:46,  8.15s/it]  4%|███████▏                                                                                                                                                                                          | 212/5680 [30:27<12:22:21,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6473', 'grad_norm': '0.1967', 'learning_rate': '0.0001993', 'ppl': '1.91', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997', 'tokens/total': 1736704, 'tokens/trainable': 1727772, 'epoch': '0.03732'}
  4%|███████▏                                                                                                                                                                                          | 212/5680 [30:27<12:22:21,  8.15s/it]  4%|███████▎                                                                                                                                                                                          | 213/5680 [30:35<12:23:15,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '1.006', 'grad_norm': '0.2636', 'learning_rate': '0.0001993', 'ppl': '2.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 1744896, 'tokens/trainable': 1735960, 'epoch': '0.0375'}
  4%|███████▎                                                                                                                                                                                          | 213/5680 [30:35<12:23:15,  8.16s/it]  4%|███████▎                                                                                                                                                                                          | 214/5680 [30:43<12:21:42,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.6097', 'grad_norm': '0.2033', 'learning_rate': '0.0001993', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 1753088, 'tokens/trainable': 1744078, 'epoch': '0.03768'}
  4%|███████▎                                                                                                                                                                                          | 214/5680 [30:43<12:21:42,  8.14s/it]  4%|███████▎                                                                                                                                                                                          | 215/5680 [30:51<12:21:50,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7797', 'grad_norm': '0.2217', 'learning_rate': '0.0001993', 'ppl': '2.181', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 1761280, 'tokens/trainable': 1752205, 'epoch': '0.03785'}
  4%|███████▎                                                                                                                                                                                          | 215/5680 [30:51<12:21:50,  8.14s/it]  4%|███████▍                                                                                                                                                                                          | 216/5680 [30:59<12:22:51,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.5676', 'grad_norm': '0.2086', 'learning_rate': '0.0001993', 'ppl': '1.764', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.2', 'tokens/total': 1769472, 'tokens/trainable': 1760348, 'epoch': '0.03803'}
  4%|███████▍                                                                                                                                                                                          | 216/5680 [30:59<12:22:51,  8.16s/it]  4%|███████▍                                                                                                                                                                                          | 217/5680 [31:07<12:23:47,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.5464', 'grad_norm': '0.1878', 'learning_rate': '0.0001993', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.9', 'tokens/total': 1777664, 'tokens/trainable': 1768484, 'epoch': '0.0382'}
  4%|███████▍                                                                                                                                                                                          | 217/5680 [31:07<12:23:47,  8.17s/it]  4%|███████▍                                                                                                                                                                                          | 218/5680 [31:16<12:24:00,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '1.172', 'grad_norm': '0.2597', 'learning_rate': '0.0001993', 'ppl': '3.23', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.1', 'tokens/total': 1785856, 'tokens/trainable': 1776631, 'epoch': '0.03838'}
  4%|███████▍                                                                                                                                                                                          | 218/5680 [31:16<12:24:00,  8.17s/it]  4%|███████▍                                                                                                                                                                                          | 219/5680 [31:24<12:23:30,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.6886', 'grad_norm': '0.221', 'learning_rate': '0.0001993', 'ppl': '1.991', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.2', 'tokens/total': 1794048, 'tokens/trainable': 1784761, 'epoch': '0.03856'}
  4%|███████▍                                                                                                                                                                                          | 219/5680 [31:24<12:23:30,  8.17s/it]  4%|███████▌                                                                                                                                                                                          | 220/5680 [31:32<12:23:15,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7906', 'grad_norm': '0.2292', 'learning_rate': '0.0001993', 'ppl': '2.205', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.4', 'tokens/total': 1802240, 'tokens/trainable': 1792909, 'epoch': '0.03873'}
  4%|███████▌                                                                                                                                                                                          | 220/5680 [31:32<12:23:15,  8.17s/it]  4%|███████▌                                                                                                                                                                                          | 221/5680 [31:40<12:22:11,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.8556', 'grad_norm': '0.27', 'learning_rate': '0.0001993', 'ppl': '2.353', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 1810432, 'tokens/trainable': 1801091, 'epoch': '0.03891'}
  4%|███████▌                                                                                                                                                                                          | 221/5680 [31:40<12:22:11,  8.16s/it]  4%|███████▌                                                                                                                                                                                          | 222/5680 [31:48<12:22:05,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.5594', 'grad_norm': '0.2011', 'learning_rate': '0.0001993', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 1818624, 'tokens/trainable': 1809268, 'epoch': '0.03908'}
  4%|███████▌                                                                                                                                                                                          | 222/5680 [31:48<12:22:05,  8.16s/it]  4%|███████▌                                                                                                                                                                                          | 223/5680 [31:56<12:21:29,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '1.087', 'grad_norm': '0.265', 'learning_rate': '0.0001992', 'ppl': '2.965', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.4', 'tokens/total': 1826816, 'tokens/trainable': 1817402, 'epoch': '0.03926'}
  4%|███████▌                                                                                                                                                                                          | 223/5680 [31:56<12:21:29,  8.15s/it]  4%|███████▋                                                                                                                                                                                          | 224/5680 [32:04<12:18:45,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.9941', 'grad_norm': '0.2343', 'learning_rate': '0.0001992', 'ppl': '2.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 1835008, 'tokens/trainable': 1825556, 'epoch': '0.03944'}
  4%|███████▋                                                                                                                                                                                          | 224/5680 [32:04<12:18:45,  8.12s/it]  4%|███████▋                                                                                                                                                                                          | 225/5680 [32:13<12:17:49,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.5976', 'grad_norm': '0.1867', 'learning_rate': '0.0001992', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 1843200, 'tokens/trainable': 1833727, 'epoch': '0.03961'}
  4%|███████▋                                                                                                                                                                                          | 225/5680 [32:13<12:17:49,  8.12s/it]  4%|███████▋                                                                                                                                                                                          | 226/5680 [32:21<12:16:24,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.8674', 'grad_norm': '0.2294', 'learning_rate': '0.0001992', 'ppl': '2.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 1851392, 'tokens/trainable': 1841863, 'epoch': '0.03979'}
  4%|███████▋                                                                                                                                                                                          | 226/5680 [32:21<12:16:24,  8.10s/it]  4%|███████▊                                                                                                                                                                                          | 227/5680 [32:29<12:24:50,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.7178', 'grad_norm': '0.2048', 'learning_rate': '0.0001992', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '970.2', 'tokens/total': 1859584, 'tokens/trainable': 1850025, 'epoch': '0.03996'}
  4%|███████▊                                                                                                                                                                                          | 227/5680 [32:29<12:24:50,  8.20s/it]  4%|███████▊                                                                                                                                                                                          | 228/5680 [32:37<12:31:24,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.9034', 'grad_norm': '0.2273', 'learning_rate': '0.0001992', 'ppl': '2.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967.2', 'tokens/total': 1867776, 'tokens/trainable': 1858187, 'epoch': '0.04014'}
  4%|███████▊                                                                                                                                                                                          | 228/5680 [32:37<12:31:24,  8.27s/it]  4%|███████▊                                                                                                                                                                                          | 229/5680 [32:46<12:34:56,  8.31s/it]                                                                                                                                                                                                                                             {'loss': '0.8262', 'grad_norm': '0.2108', 'learning_rate': '0.0001992', 'ppl': '2.285', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '966.2', 'tokens/total': 1875968, 'tokens/trainable': 1866303, 'epoch': '0.04032'}
  4%|███████▊                                                                                                                                                                                          | 229/5680 [32:46<12:34:56,  8.31s/it]  4%|███████▊                                                                                                                                                                                          | 230/5680 [32:54<12:36:15,  8.33s/it]                                                                                                                                                                                                                                             {'loss': '0.9984', 'grad_norm': '0.237', 'learning_rate': '0.0001992', 'ppl': '2.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976', 'tokens/total': 1884160, 'tokens/trainable': 1874462, 'epoch': '0.04049'}
  4%|███████▊                                                                                                                                                                                          | 230/5680 [32:54<12:36:15,  8.33s/it]  4%|███████▉                                                                                                                                                                                          | 231/5680 [33:02<12:32:43,  8.29s/it]                                                                                                                                                                                                                                             {'loss': '0.9313', 'grad_norm': '0.2423', 'learning_rate': '0.0001992', 'ppl': '2.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 1892352, 'tokens/trainable': 1882637, 'epoch': '0.04067'}
  4%|███████▉                                                                                                                                                                                          | 231/5680 [33:02<12:32:43,  8.29s/it]  4%|███████▉                                                                                                                                                                                          | 232/5680 [33:11<12:28:59,  8.25s/it]                                                                                                                                                                                                                                             {'loss': '0.7775', 'grad_norm': '0.2376', 'learning_rate': '0.0001992', 'ppl': '2.176', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.8', 'tokens/total': 1900544, 'tokens/trainable': 1890773, 'epoch': '0.04085'}
  4%|███████▉                                                                                                                                                                                          | 232/5680 [33:11<12:28:59,  8.25s/it]  4%|███████▉                                                                                                                                                                                          | 233/5680 [33:19<12:27:10,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.9555', 'grad_norm': '0.2512', 'learning_rate': '0.0001992', 'ppl': '2.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 1908736, 'tokens/trainable': 1898936, 'epoch': '0.04102'}
  4%|███████▉                                                                                                                                                                                          | 233/5680 [33:19<12:27:10,  8.23s/it]  4%|███████▉                                                                                                                                                                                          | 234/5680 [33:27<12:25:27,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '1.064', 'grad_norm': '0.249', 'learning_rate': '0.0001992', 'ppl': '2.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.1', 'tokens/total': 1916928, 'tokens/trainable': 1907074, 'epoch': '0.0412'}
  4%|███████▉                                                                                                                                                                                          | 234/5680 [33:27<12:25:27,  8.21s/it]  4%|████████                                                                                                                                                                                          | 235/5680 [33:35<12:24:25,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.7926', 'grad_norm': '0.2065', 'learning_rate': '0.0001992', 'ppl': '2.209', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.9', 'tokens/total': 1925120, 'tokens/trainable': 1915186, 'epoch': '0.04137'}
  4%|████████                                                                                                                                                                                          | 235/5680 [33:35<12:24:25,  8.20s/it]  4%|████████                                                                                                                                                                                          | 236/5680 [33:43<12:21:58,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.8302', 'grad_norm': '0.2095', 'learning_rate': '0.0001992', 'ppl': '2.294', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 1933312, 'tokens/trainable': 1923310, 'epoch': '0.04155'}
  4%|████████                                                                                                                                                                                          | 236/5680 [33:43<12:21:58,  8.18s/it]  4%|████████                                                                                                                                                                                          | 237/5680 [33:51<12:22:09,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.7777', 'grad_norm': '0.206', 'learning_rate': '0.0001991', 'ppl': '2.177', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997', 'tokens/total': 1941504, 'tokens/trainable': 1931472, 'epoch': '0.04173'}
  4%|████████                                                                                                                                                                                          | 237/5680 [33:51<12:22:09,  8.18s/it]  4%|████████▏                                                                                                                                                                                         | 238/5680 [34:00<12:21:41,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.7033', 'grad_norm': '0.2257', 'learning_rate': '0.0001991', 'ppl': '2.02', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.1', 'tokens/total': 1949696, 'tokens/trainable': 1939615, 'epoch': '0.0419'}
  4%|████████▏                                                                                                                                                                                         | 238/5680 [34:00<12:21:41,  8.18s/it]  4%|████████▏                                                                                                                                                                                         | 239/5680 [34:08<12:22:44,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '1', 'grad_norm': '0.2288', 'learning_rate': '0.0001991', 'ppl': '2.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.9', 'tokens/total': 1957888, 'tokens/trainable': 1947717, 'epoch': '0.04208'}
  4%|████████▏                                                                                                                                                                                         | 239/5680 [34:08<12:22:44,  8.19s/it]  4%|████████▏                                                                                                                                                                                         | 240/5680 [34:16<12:24:13,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.9997', 'grad_norm': '0.2653', 'learning_rate': '0.0001991', 'ppl': '2.717', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.7', 'tokens/total': 1966080, 'tokens/trainable': 1955896, 'epoch': '0.04225'}
  4%|████████▏                                                                                                                                                                                         | 240/5680 [34:16<12:24:13,  8.21s/it]  4%|████████▏                                                                                                                                                                                         | 241/5680 [34:25<12:32:43,  8.30s/it]                                                                                                                                                                                                                                             {'loss': '1.5', 'grad_norm': '0.3197', 'learning_rate': '0.0001991', 'ppl': '4.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '960.7', 'tokens/total': 1974272, 'tokens/trainable': 1964073, 'epoch': '0.04243'}
  4%|████████▏                                                                                                                                                                                         | 241/5680 [34:25<12:32:43,  8.30s/it]  4%|████████▎                                                                                                                                                                                         | 242/5680 [34:33<12:35:07,  8.33s/it]                                                                                                                                                                                                                                             {'loss': '1.029', 'grad_norm': '0.2548', 'learning_rate': '0.0001991', 'ppl': '2.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967.5', 'tokens/total': 1982464, 'tokens/trainable': 1972195, 'epoch': '0.04261'}
  4%|████████▎                                                                                                                                                                                         | 242/5680 [34:33<12:35:07,  8.33s/it]  4%|████████▎                                                                                                                                                                                         | 243/5680 [34:41<12:37:05,  8.35s/it]                                                                                                                                                                                                                                             {'loss': '0.8836', 'grad_norm': '0.2102', 'learning_rate': '0.0001991', 'ppl': '2.42', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.8', 'tokens/total': 1990656, 'tokens/trainable': 1980386, 'epoch': '0.04278'}
  4%|████████▎                                                                                                                                                                                         | 243/5680 [34:41<12:37:05,  8.35s/it]  4%|████████▎                                                                                                                                                                                         | 244/5680 [34:50<12:38:18,  8.37s/it]                                                                                                                                                                                                                                             {'loss': '0.9158', 'grad_norm': '0.2682', 'learning_rate': '0.0001991', 'ppl': '2.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.7', 'tokens/total': 1998848, 'tokens/trainable': 1988556, 'epoch': '0.04296'}
  4%|████████▎                                                                                                                                                                                         | 244/5680 [34:50<12:38:18,  8.37s/it]  4%|████████▎                                                                                                                                                                                         | 245/5680 [34:58<12:35:13,  8.34s/it]                                                                                                                                                                                                                                             {'loss': '0.7766', 'grad_norm': '0.214', 'learning_rate': '0.0001991', 'ppl': '2.174', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.5', 'tokens/total': 2007040, 'tokens/trainable': 1996694, 'epoch': '0.04313'}
  4%|████████▎                                                                                                                                                                                         | 245/5680 [34:58<12:35:13,  8.34s/it]  4%|████████▍                                                                                                                                                                                         | 246/5680 [35:06<12:36:47,  8.36s/it]                                                                                                                                                                                                                                             {'loss': '0.5938', 'grad_norm': '0.2025', 'learning_rate': '0.0001991', 'ppl': '1.811', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967', 'tokens/total': 2015232, 'tokens/trainable': 2004814, 'epoch': '0.04331'}
  4%|████████▍                                                                                                                                                                                         | 246/5680 [35:06<12:36:47,  8.36s/it]  4%|████████▍                                                                                                                                                                                         | 247/5680 [35:15<12:36:32,  8.35s/it]                                                                                                                                                                                                                                             {'loss': '0.8426', 'grad_norm': '0.2423', 'learning_rate': '0.0001991', 'ppl': '2.322', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973.1', 'tokens/total': 2023424, 'tokens/trainable': 2012939, 'epoch': '0.04349'}
  4%|████████▍                                                                                                                                                                                         | 247/5680 [35:15<12:36:32,  8.35s/it]  4%|████████▍                                                                                                                                                                                         | 248/5680 [35:23<12:38:32,  8.38s/it]                                                                                                                                                                                                                                             {'loss': '0.5802', 'grad_norm': '0.1919', 'learning_rate': '0.0001991', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.4', 'tokens/total': 2031616, 'tokens/trainable': 2021064, 'epoch': '0.04366'}
  4%|████████▍                                                                                                                                                                                         | 248/5680 [35:23<12:38:32,  8.38s/it]  4%|████████▌                                                                                                                                                                                         | 249/5680 [35:32<12:41:34,  8.41s/it]                                                                                                                                                                                                                                             {'loss': '0.7376', 'grad_norm': '0.2239', 'learning_rate': '0.0001991', 'ppl': '2.091', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '963.9', 'tokens/total': 2039808, 'tokens/trainable': 2029249, 'epoch': '0.04384'}
  4%|████████▌                                                                                                                                                                                         | 249/5680 [35:32<12:41:34,  8.41s/it]  4%|████████▌                                                                                                                                                                                         | 250/5680 [35:40<12:42:31,  8.43s/it]                                                                                                                                                                                                                                             {'loss': '0.8052', 'grad_norm': '0.2059', 'learning_rate': '0.0001991', 'ppl': '2.237', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '961.6', 'tokens/total': 2048000, 'tokens/trainable': 2037376, 'epoch': '0.04401'}
  4%|████████▌                                                                                                                                                                                         | 250/5680 [35:40<12:42:31,  8.43s/it]  4%|████████▌                                                                                                                                                                                         | 251/5680 [35:49<12:43:27,  8.44s/it]                                                                                                                                                                                                                                             {'loss': '1.064', 'grad_norm': '0.2386', 'learning_rate': '0.000199', 'ppl': '2.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '963.4', 'tokens/total': 2056192, 'tokens/trainable': 2045528, 'epoch': '0.04419'}
  4%|████████▌                                                                                                                                                                                         | 251/5680 [35:49<12:43:27,  8.44s/it]  4%|████████▌                                                                                                                                                                                         | 252/5680 [35:57<12:40:54,  8.41s/it]                                                                                                                                                                                                                                             {'loss': '0.9658', 'grad_norm': '0.2443', 'learning_rate': '0.000199', 'ppl': '2.627', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.5', 'tokens/total': 2064384, 'tokens/trainable': 2053702, 'epoch': '0.04437'}
  4%|████████▌                                                                                                                                                                                         | 252/5680 [35:57<12:40:54,  8.41s/it]  4%|████████▋                                                                                                                                                                                         | 253/5680 [36:05<12:39:21,  8.40s/it]                                                                                                                                                                                                                                             {'loss': '0.9207', 'grad_norm': '0.2267', 'learning_rate': '0.000199', 'ppl': '2.511', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '966.3', 'tokens/total': 2072576, 'tokens/trainable': 2061772, 'epoch': '0.04454'}
  4%|████████▋                                                                                                                                                                                         | 253/5680 [36:05<12:39:21,  8.40s/it]  4%|████████▋                                                                                                                                                                                         | 254/5680 [36:14<12:36:38,  8.37s/it]                                                                                                                                                                                                                                             {'loss': '1.07', 'grad_norm': '0.2553', 'learning_rate': '0.000199', 'ppl': '2.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.6', 'tokens/total': 2080768, 'tokens/trainable': 2069917, 'epoch': '0.04472'}
  4%|████████▋                                                                                                                                                                                         | 254/5680 [36:14<12:36:38,  8.37s/it]  4%|████████▋                                                                                                                                                                                         | 255/5680 [36:22<12:37:33,  8.38s/it]                                                                                                                                                                                                                                             {'loss': '0.4542', 'grad_norm': '0.1777', 'learning_rate': '0.000199', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '965.1', 'tokens/total': 2088960, 'tokens/trainable': 2078027, 'epoch': '0.04489'}
  4%|████████▋                                                                                                                                                                                         | 255/5680 [36:22<12:37:33,  8.38s/it]  5%|████████▋                                                                                                                                                                                         | 256/5680 [36:31<12:39:53,  8.41s/it]                                                                                                                                                                                                                                             {'loss': '0.7808', 'grad_norm': '0.2093', 'learning_rate': '0.000199', 'ppl': '2.183', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '965.1', 'tokens/total': 2097152, 'tokens/trainable': 2086199, 'epoch': '0.04507'}
  5%|████████▋                                                                                                                                                                                         | 256/5680 [36:31<12:39:53,  8.41s/it]  5%|████████▊                                                                                                                                                                                         | 257/5680 [36:39<12:33:14,  8.33s/it]                                                                                                                                                                                                                                             {'loss': '0.4081', 'grad_norm': '0.1924', 'learning_rate': '0.000199', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2105344, 'tokens/trainable': 2094373, 'epoch': '0.04525'}
  5%|████████▊                                                                                                                                                                                         | 257/5680 [36:39<12:33:14,  8.33s/it]  5%|████████▊                                                                                                                                                                                         | 258/5680 [36:47<12:27:17,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.7653', 'grad_norm': '0.2447', 'learning_rate': '0.000199', 'ppl': '2.15', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2113536, 'tokens/trainable': 2102555, 'epoch': '0.04542'}
  5%|████████▊                                                                                                                                                                                         | 258/5680 [36:47<12:27:17,  8.27s/it]  5%|████████▊                                                                                                                                                                                         | 259/5680 [36:55<12:23:39,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.6695', 'grad_norm': '0.2096', 'learning_rate': '0.000199', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2121728, 'tokens/trainable': 2110702, 'epoch': '0.0456'}
  5%|████████▊                                                                                                                                                                                         | 259/5680 [36:55<12:23:39,  8.23s/it]  5%|████████▉                                                                                                                                                                                         | 260/5680 [37:03<12:22:07,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.754', 'grad_norm': '0.2149', 'learning_rate': '0.000199', 'ppl': '2.125', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.2', 'tokens/total': 2129920, 'tokens/trainable': 2118824, 'epoch': '0.04577'}
  5%|████████▉                                                                                                                                                                                         | 260/5680 [37:03<12:22:07,  8.22s/it]  5%|████████▉                                                                                                                                                                                         | 261/5680 [37:11<12:20:57,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.757', 'grad_norm': '0.2935', 'learning_rate': '0.000199', 'ppl': '2.132', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.9', 'tokens/total': 2138112, 'tokens/trainable': 2126964, 'epoch': '0.04595'}
  5%|████████▉                                                                                                                                                                                         | 261/5680 [37:11<12:20:57,  8.20s/it]  5%|████████▉                                                                                                                                                                                         | 262/5680 [37:19<12:18:38,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.8383', 'grad_norm': '0.2261', 'learning_rate': '0.000199', 'ppl': '2.312', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.6', 'tokens/total': 2146304, 'tokens/trainable': 2135041, 'epoch': '0.04613'}
  5%|████████▉                                                                                                                                                                                         | 262/5680 [37:19<12:18:38,  8.18s/it]  5%|████████▉                                                                                                                                                                                         | 263/5680 [37:28<12:17:29,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '1.173', 'grad_norm': '0.2612', 'learning_rate': '0.000199', 'ppl': '3.232', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2154496, 'tokens/trainable': 2143187, 'epoch': '0.0463'}
  5%|████████▉                                                                                                                                                                                         | 263/5680 [37:28<12:17:29,  8.17s/it]  5%|█████████                                                                                                                                                                                         | 264/5680 [37:36<12:16:12,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.5785', 'grad_norm': '0.2011', 'learning_rate': '0.0001989', 'ppl': '1.783', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2162688, 'tokens/trainable': 2151359, 'epoch': '0.04648'}
  5%|█████████                                                                                                                                                                                         | 264/5680 [37:36<12:16:12,  8.16s/it]  5%|█████████                                                                                                                                                                                         | 265/5680 [37:45<12:36:23,  8.38s/it]                                                                                                                                                                                                                                             {'loss': '0.939', 'grad_norm': '0.2417', 'learning_rate': '0.0001989', 'ppl': '2.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '916.7', 'tokens/total': 2170880, 'tokens/trainable': 2159521, 'epoch': '0.04665'}
  5%|█████████                                                                                                                                                                                         | 265/5680 [37:45<12:36:23,  8.38s/it]  5%|█████████                                                                                                                                                                                         | 266/5680 [37:53<12:29:36,  8.31s/it]                                                                                                                                                                                                                                             {'loss': '0.8128', 'grad_norm': '0.2251', 'learning_rate': '0.0001989', 'ppl': '2.254', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 2179072, 'tokens/trainable': 2167688, 'epoch': '0.04683'}
  5%|█████████                                                                                                                                                                                         | 266/5680 [37:53<12:29:36,  8.31s/it]  5%|█████████                                                                                                                                                                                         | 267/5680 [38:01<12:23:41,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.6849', 'grad_norm': '0.2137', 'learning_rate': '0.0001989', 'ppl': '1.984', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 2187264, 'tokens/trainable': 2175861, 'epoch': '0.04701'}
  5%|█████████                                                                                                                                                                                         | 267/5680 [38:01<12:23:41,  8.24s/it]  5%|█████████▏                                                                                                                                                                                        | 268/5680 [38:09<12:17:51,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.9', 'grad_norm': '0.225', 'learning_rate': '0.0001989', 'ppl': '2.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 2195456, 'tokens/trainable': 2184006, 'epoch': '0.04718'}
  5%|█████████▏                                                                                                                                                                                        | 268/5680 [38:09<12:17:51,  8.18s/it]  5%|█████████▏                                                                                                                                                                                        | 269/5680 [38:17<12:27:11,  8.29s/it]                                                                                                                                                                                                                                             {'loss': '0.5001', 'grad_norm': '0.2081', 'learning_rate': '0.0001989', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '956.4', 'tokens/total': 2203648, 'tokens/trainable': 2192162, 'epoch': '0.04736'}
  5%|█████████▏                                                                                                                                                                                        | 269/5680 [38:17<12:27:11,  8.29s/it]  5%|█████████▏                                                                                                                                                                                        | 270/5680 [38:26<12:22:55,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.53', 'grad_norm': '0.199', 'learning_rate': '0.0001989', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2211840, 'tokens/trainable': 2200337, 'epoch': '0.04754'}
  5%|█████████▏                                                                                                                                                                                        | 270/5680 [38:26<12:22:55,  8.24s/it]  5%|█████████▎                                                                                                                                                                                        | 271/5680 [38:34<12:19:26,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.9894', 'grad_norm': '0.2874', 'learning_rate': '0.0001989', 'ppl': '2.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 2220032, 'tokens/trainable': 2208506, 'epoch': '0.04771'}
  5%|█████████▎                                                                                                                                                                                        | 271/5680 [38:34<12:19:26,  8.20s/it]  5%|█████████▎                                                                                                                                                                                        | 272/5680 [38:42<12:17:47,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.5616', 'grad_norm': '0.1916', 'learning_rate': '0.0001989', 'ppl': '1.753', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.3', 'tokens/total': 2228224, 'tokens/trainable': 2216596, 'epoch': '0.04789'}
  5%|█████████▎                                                                                                                                                                                        | 272/5680 [38:42<12:17:47,  8.19s/it]  5%|█████████▎                                                                                                                                                                                        | 273/5680 [38:50<12:23:42,  8.25s/it]                                                                                                                                                                                                                                             {'loss': '0.8124', 'grad_norm': '0.2369', 'learning_rate': '0.0001989', 'ppl': '2.253', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '966', 'tokens/total': 2236416, 'tokens/trainable': 2224715, 'epoch': '0.04806'}
  5%|█████████▎                                                                                                                                                                                        | 273/5680 [38:50<12:23:42,  8.25s/it]  5%|█████████▎                                                                                                                                                                                        | 274/5680 [38:58<12:19:26,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '1.022', 'grad_norm': '0.2729', 'learning_rate': '0.0001989', 'ppl': '2.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2244608, 'tokens/trainable': 2232865, 'epoch': '0.04824'}
  5%|█████████▎                                                                                                                                                                                        | 274/5680 [38:58<12:19:26,  8.21s/it]  5%|█████████▍                                                                                                                                                                                        | 275/5680 [39:07<12:22:28,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.8068', 'grad_norm': '0.2294', 'learning_rate': '0.0001989', 'ppl': '2.241', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978', 'tokens/total': 2252800, 'tokens/trainable': 2241004, 'epoch': '0.04842'}
  5%|█████████▍                                                                                                                                                                                        | 275/5680 [39:07<12:22:28,  8.24s/it]  5%|█████████▍                                                                                                                                                                                        | 276/5680 [39:15<12:22:53,  8.25s/it]                                                                                                                                                                                                                                             {'loss': '0.6955', 'grad_norm': '0.2724', 'learning_rate': '0.0001988', 'ppl': '2.005', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.6', 'tokens/total': 2260992, 'tokens/trainable': 2249112, 'epoch': '0.04859'}
  5%|█████████▍                                                                                                                                                                                        | 276/5680 [39:15<12:22:53,  8.25s/it]  5%|█████████▍                                                                                                                                                                                        | 277/5680 [39:23<12:24:26,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.7556', 'grad_norm': '0.2366', 'learning_rate': '0.0001988', 'ppl': '2.129', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.4', 'tokens/total': 2269184, 'tokens/trainable': 2257233, 'epoch': '0.04877'}
  5%|█████████▍                                                                                                                                                                                        | 277/5680 [39:23<12:24:26,  8.27s/it]  5%|█████████▍                                                                                                                                                                                        | 278/5680 [39:32<12:26:22,  8.29s/it]                                                                                                                                                                                                                                             {'loss': '0.7525', 'grad_norm': '0.2431', 'learning_rate': '0.0001988', 'ppl': '2.122', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.5', 'tokens/total': 2277376, 'tokens/trainable': 2265403, 'epoch': '0.04894'}
  5%|█████████▍                                                                                                                                                                                        | 278/5680 [39:32<12:26:22,  8.29s/it]  5%|█████████▌                                                                                                                                                                                        | 279/5680 [39:40<12:21:54,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '1.029', 'grad_norm': '0.2758', 'learning_rate': '0.0001988', 'ppl': '2.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2285568, 'tokens/trainable': 2273577, 'epoch': '0.04912'}
  5%|█████████▌                                                                                                                                                                                        | 279/5680 [39:40<12:21:54,  8.24s/it]  5%|█████████▌                                                                                                                                                                                        | 280/5680 [39:48<12:16:56,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.6815', 'grad_norm': '0.2148', 'learning_rate': '0.0001988', 'ppl': '1.977', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2293760, 'tokens/trainable': 2281700, 'epoch': '0.0493'}
  5%|█████████▌                                                                                                                                                                                        | 280/5680 [39:48<12:16:56,  8.19s/it]  5%|█████████▌                                                                                                                                                                                        | 281/5680 [39:56<12:19:12,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.8831', 'grad_norm': '0.2321', 'learning_rate': '0.0001988', 'ppl': '2.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.8', 'tokens/total': 2301952, 'tokens/trainable': 2289848, 'epoch': '0.04947'}
  5%|█████████▌                                                                                                                                                                                        | 281/5680 [39:56<12:19:12,  8.21s/it]  5%|█████████▋                                                                                                                                                                                        | 282/5680 [40:04<12:16:39,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '1.184', 'grad_norm': '0.2727', 'learning_rate': '0.0001988', 'ppl': '3.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2310144, 'tokens/trainable': 2297982, 'epoch': '0.04965'}
  5%|█████████▋                                                                                                                                                                                        | 282/5680 [40:04<12:16:39,  8.19s/it]  5%|█████████▋                                                                                                                                                                                        | 283/5680 [40:12<12:16:12,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.5811', 'grad_norm': '0.1942', 'learning_rate': '0.0001988', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.5', 'tokens/total': 2318336, 'tokens/trainable': 2306136, 'epoch': '0.04982'}
  5%|█████████▋                                                                                                                                                                                        | 283/5680 [40:12<12:16:12,  8.18s/it]  5%|█████████▋                                                                                                                                                                                        | 284/5680 [40:21<12:16:10,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.7542', 'grad_norm': '0.2062', 'learning_rate': '0.0001988', 'ppl': '2.126', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.6', 'tokens/total': 2326528, 'tokens/trainable': 2314277, 'epoch': '0.05'}
  5%|█████████▋                                                                                                                                                                                        | 284/5680 [40:21<12:16:10,  8.19s/it]  5%|█████████▋                                                                                                                                                                                        | 285/5680 [40:29<12:16:55,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.6407', 'grad_norm': '0.2031', 'learning_rate': '0.0001988', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.6', 'tokens/total': 2334720, 'tokens/trainable': 2322441, 'epoch': '0.05018'}
  5%|█████████▋                                                                                                                                                                                        | 285/5680 [40:29<12:16:55,  8.20s/it]  5%|█████████▊                                                                                                                                                                                        | 286/5680 [40:37<12:12:52,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6252', 'grad_norm': '0.1918', 'learning_rate': '0.0001988', 'ppl': '1.869', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 2342912, 'tokens/trainable': 2330631, 'epoch': '0.05035'}
  5%|█████████▊                                                                                                                                                                                        | 286/5680 [40:37<12:12:52,  8.15s/it]  5%|█████████▊                                                                                                                                                                                        | 287/5680 [40:45<12:10:39,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.8046', 'grad_norm': '0.2072', 'learning_rate': '0.0001988', 'ppl': '2.236', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 2351104, 'tokens/trainable': 2338788, 'epoch': '0.05053'}
  5%|█████████▊                                                                                                                                                                                        | 287/5680 [40:45<12:10:39,  8.13s/it]  5%|█████████▊                                                                                                                                                                                        | 288/5680 [40:53<12:09:34,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8113', 'grad_norm': '0.2094', 'learning_rate': '0.0001987', 'ppl': '2.251', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 2359296, 'tokens/trainable': 2346921, 'epoch': '0.0507'}
  5%|█████████▊                                                                                                                                                                                        | 288/5680 [40:53<12:09:34,  8.12s/it]  5%|█████████▊                                                                                                                                                                                        | 289/5680 [41:01<12:12:11,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '1.035', 'grad_norm': '0.2419', 'learning_rate': '0.0001987', 'ppl': '2.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.1', 'tokens/total': 2367488, 'tokens/trainable': 2355057, 'epoch': '0.05088'}
  5%|█████████▊                                                                                                                                                                                        | 289/5680 [41:01<12:12:11,  8.15s/it]  5%|█████████▉                                                                                                                                                                                        | 290/5680 [41:09<12:15:46,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.6183', 'grad_norm': '0.1959', 'learning_rate': '0.0001987', 'ppl': '1.856', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.5', 'tokens/total': 2375680, 'tokens/trainable': 2363196, 'epoch': '0.05106'}
  5%|█████████▉                                                                                                                                                                                        | 290/5680 [41:09<12:15:46,  8.19s/it]  5%|█████████▉                                                                                                                                                                                        | 291/5680 [41:18<12:24:32,  8.29s/it]                                                                                                                                                                                                                                             {'loss': '0.6282', 'grad_norm': '0.1954', 'learning_rate': '0.0001987', 'ppl': '1.874', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '954.6', 'tokens/total': 2383872, 'tokens/trainable': 2371327, 'epoch': '0.05123'}
  5%|█████████▉                                                                                                                                                                                        | 291/5680 [41:18<12:24:32,  8.29s/it]  5%|█████████▉                                                                                                                                                                                        | 292/5680 [41:26<12:22:48,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.7026', 'grad_norm': '0.2245', 'learning_rate': '0.0001987', 'ppl': '2.019', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.9', 'tokens/total': 2392064, 'tokens/trainable': 2379487, 'epoch': '0.05141'}
  5%|█████████▉                                                                                                                                                                                        | 292/5680 [41:26<12:22:48,  8.27s/it]  5%|██████████                                                                                                                                                                                        | 293/5680 [41:34<12:19:30,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.6003', 'grad_norm': '0.1917', 'learning_rate': '0.0001987', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.6', 'tokens/total': 2400256, 'tokens/trainable': 2387587, 'epoch': '0.05158'}
  5%|██████████                                                                                                                                                                                        | 293/5680 [41:34<12:19:30,  8.24s/it]  5%|██████████                                                                                                                                                                                        | 294/5680 [41:43<12:17:55,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.7303', 'grad_norm': '0.262', 'learning_rate': '0.0001987', 'ppl': '2.076', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.1', 'tokens/total': 2408448, 'tokens/trainable': 2395744, 'epoch': '0.05176'}
  5%|██████████                                                                                                                                                                                        | 294/5680 [41:43<12:17:55,  8.22s/it]  5%|██████████                                                                                                                                                                                        | 295/5680 [41:51<12:18:11,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.6993', 'grad_norm': '0.213', 'learning_rate': '0.0001987', 'ppl': '2.012', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.5', 'tokens/total': 2416640, 'tokens/trainable': 2403882, 'epoch': '0.05194'}
  5%|██████████                                                                                                                                                                                        | 295/5680 [41:51<12:18:11,  8.22s/it]  5%|██████████                                                                                                                                                                                        | 296/5680 [41:59<12:17:52,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.9769', 'grad_norm': '0.2303', 'learning_rate': '0.0001987', 'ppl': '2.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.9', 'tokens/total': 2424832, 'tokens/trainable': 2412056, 'epoch': '0.05211'}
  5%|██████████                                                                                                                                                                                        | 296/5680 [41:59<12:17:52,  8.22s/it]  5%|██████████▏                                                                                                                                                                                       | 297/5680 [42:07<12:16:49,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.8823', 'grad_norm': '0.2159', 'learning_rate': '0.0001987', 'ppl': '2.416', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.4', 'tokens/total': 2433024, 'tokens/trainable': 2420178, 'epoch': '0.05229'}
  5%|██████████▏                                                                                                                                                                                       | 297/5680 [42:07<12:16:49,  8.21s/it]  5%|██████████▏                                                                                                                                                                                       | 298/5680 [42:16<12:19:42,  8.25s/it]                                                                                                                                                                                                                                             {'loss': '0.508', 'grad_norm': '0.2027', 'learning_rate': '0.0001987', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.6', 'tokens/total': 2441216, 'tokens/trainable': 2428284, 'epoch': '0.05246'}
  5%|██████████▏                                                                                                                                                                                       | 298/5680 [42:16<12:19:42,  8.25s/it]  5%|██████████▏                                                                                                                                                                                       | 299/5680 [42:24<12:18:26,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.9213', 'grad_norm': '0.2426', 'learning_rate': '0.0001986', 'ppl': '2.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.6', 'tokens/total': 2449408, 'tokens/trainable': 2436376, 'epoch': '0.05264'}
  5%|██████████▏                                                                                                                                                                                       | 299/5680 [42:24<12:18:26,  8.23s/it]  5%|██████████▏                                                                                                                                                                                       | 300/5680 [42:32<12:16:05,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.7765', 'grad_norm': '0.2273', 'learning_rate': '0.0001986', 'ppl': '2.174', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.6', 'tokens/total': 2457600, 'tokens/trainable': 2444497, 'epoch': '0.05282'}
  5%|██████████▏                                                                                                                                                                                       | 300/5680 [42:32<12:16:05,  8.21s/it]  5%|██████████▎                                                                                                                                                                                       | 301/5680 [42:40<12:14:42,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.8697', 'grad_norm': '0.2289', 'learning_rate': '0.0001986', 'ppl': '2.386', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2465792, 'tokens/trainable': 2452667, 'epoch': '0.05299'}
  5%|██████████▎                                                                                                                                                                                       | 301/5680 [42:40<12:14:42,  8.20s/it]  5%|██████████▎                                                                                                                                                                                       | 302/5680 [42:48<12:15:08,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.7757', 'grad_norm': '0.225', 'learning_rate': '0.0001986', 'ppl': '2.172', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.8', 'tokens/total': 2473984, 'tokens/trainable': 2460844, 'epoch': '0.05317'}
  5%|██████████▎                                                                                                                                                                                       | 302/5680 [42:48<12:15:08,  8.20s/it]  5%|██████████▎                                                                                                                                                                                       | 303/5680 [42:56<12:13:39,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.6993', 'grad_norm': '0.2226', 'learning_rate': '0.0001986', 'ppl': '2.012', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2482176, 'tokens/trainable': 2468998, 'epoch': '0.05335'}
  5%|██████████▎                                                                                                                                                                                       | 303/5680 [42:56<12:13:39,  8.19s/it]  5%|██████████▍                                                                                                                                                                                       | 304/5680 [43:04<12:10:59,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.8307', 'grad_norm': '0.2464', 'learning_rate': '0.0001986', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 2490368, 'tokens/trainable': 2477163, 'epoch': '0.05352'}
  5%|██████████▍                                                                                                                                                                                       | 304/5680 [43:04<12:10:59,  8.16s/it]  5%|██████████▍                                                                                                                                                                                       | 305/5680 [43:13<12:09:56,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6536', 'grad_norm': '0.2081', 'learning_rate': '0.0001986', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2498560, 'tokens/trainable': 2485305, 'epoch': '0.0537'}
  5%|██████████▍                                                                                                                                                                                       | 305/5680 [43:13<12:09:56,  8.15s/it]  5%|██████████▍                                                                                                                                                                                       | 306/5680 [43:21<12:09:24,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8897', 'grad_norm': '0.245', 'learning_rate': '0.0001986', 'ppl': '2.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.8', 'tokens/total': 2506752, 'tokens/trainable': 2493434, 'epoch': '0.05387'}
  5%|██████████▍                                                                                                                                                                                       | 306/5680 [43:21<12:09:24,  8.14s/it]  5%|██████████▍                                                                                                                                                                                       | 307/5680 [43:29<12:08:51,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.9419', 'grad_norm': '0.2498', 'learning_rate': '0.0001986', 'ppl': '2.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 2514944, 'tokens/trainable': 2501602, 'epoch': '0.05405'}
  5%|██████████▍                                                                                                                                                                                       | 307/5680 [43:29<12:08:51,  8.14s/it]  5%|██████████▌                                                                                                                                                                                       | 308/5680 [43:37<12:06:44,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8908', 'grad_norm': '0.2456', 'learning_rate': '0.0001986', 'ppl': '2.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2523136, 'tokens/trainable': 2509728, 'epoch': '0.05423'}
  5%|██████████▌                                                                                                                                                                                       | 308/5680 [43:37<12:06:44,  8.12s/it]  5%|██████████▌                                                                                                                                                                                       | 309/5680 [43:45<12:06:56,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.7935', 'grad_norm': '0.2138', 'learning_rate': '0.0001986', 'ppl': '2.211', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2531328, 'tokens/trainable': 2517907, 'epoch': '0.0544'}
  5%|██████████▌                                                                                                                                                                                       | 309/5680 [43:45<12:06:56,  8.12s/it]  5%|██████████▌                                                                                                                                                                                       | 310/5680 [43:53<12:06:56,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.579', 'grad_norm': '0.2186', 'learning_rate': '0.0001985', 'ppl': '1.784', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2539520, 'tokens/trainable': 2526049, 'epoch': '0.05458'}
  5%|██████████▌                                                                                                                                                                                       | 310/5680 [43:53<12:06:56,  8.12s/it]  5%|██████████▌                                                                                                                                                                                       | 311/5680 [44:01<12:09:43,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.4561', 'grad_norm': '0.1728', 'learning_rate': '0.0001985', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.3', 'tokens/total': 2547712, 'tokens/trainable': 2534221, 'epoch': '0.05475'}
  5%|██████████▌                                                                                                                                                                                       | 311/5680 [44:01<12:09:43,  8.15s/it]  5%|██████████▋                                                                                                                                                                                       | 312/5680 [44:10<12:11:56,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.8416', 'grad_norm': '0.2194', 'learning_rate': '0.0001985', 'ppl': '2.32', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993', 'tokens/total': 2555904, 'tokens/trainable': 2542403, 'epoch': '0.05493'}
  5%|██████████▋                                                                                                                                                                                       | 312/5680 [44:10<12:11:56,  8.18s/it]  6%|██████████▋                                                                                                                                                                                       | 313/5680 [44:18<12:10:38,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8103', 'grad_norm': '0.2217', 'learning_rate': '0.0001985', 'ppl': '2.249', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.8', 'tokens/total': 2564096, 'tokens/trainable': 2550504, 'epoch': '0.05511'}
  6%|██████████▋                                                                                                                                                                                       | 313/5680 [44:18<12:10:38,  8.17s/it]  6%|██████████▋                                                                                                                                                                                       | 314/5680 [44:26<12:10:07,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.7999', 'grad_norm': '0.2334', 'learning_rate': '0.0001985', 'ppl': '2.225', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.8', 'tokens/total': 2572288, 'tokens/trainable': 2558653, 'epoch': '0.05528'}
  6%|██████████▋                                                                                                                                                                                       | 314/5680 [44:26<12:10:07,  8.16s/it]  6%|██████████▊                                                                                                                                                                                       | 315/5680 [44:34<12:11:33,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.6448', 'grad_norm': '0.2079', 'learning_rate': '0.0001985', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.5', 'tokens/total': 2580480, 'tokens/trainable': 2566777, 'epoch': '0.05546'}
  6%|██████████▊                                                                                                                                                                                       | 315/5680 [44:34<12:11:33,  8.18s/it]  6%|██████████▊                                                                                                                                                                                       | 316/5680 [44:42<12:10:59,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.6306', 'grad_norm': '0.1907', 'learning_rate': '0.0001985', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.3', 'tokens/total': 2588672, 'tokens/trainable': 2574926, 'epoch': '0.05563'}
  6%|██████████▊                                                                                                                                                                                       | 316/5680 [44:42<12:10:59,  8.18s/it]  6%|██████████▊                                                                                                                                                                                       | 317/5680 [44:51<12:09:58,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7366', 'grad_norm': '0.2063', 'learning_rate': '0.0001985', 'ppl': '2.089', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 2596864, 'tokens/trainable': 2583113, 'epoch': '0.05581'}
  6%|██████████▊                                                                                                                                                                                       | 317/5680 [44:51<12:09:58,  8.17s/it]  6%|██████████▊                                                                                                                                                                                       | 318/5680 [44:59<12:06:35,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6705', 'grad_norm': '0.2338', 'learning_rate': '0.0001985', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 2605056, 'tokens/trainable': 2591299, 'epoch': '0.05599'}
  6%|██████████▊                                                                                                                                                                                       | 318/5680 [44:59<12:06:35,  8.13s/it]  6%|██████████▉                                                                                                                                                                                       | 319/5680 [45:07<12:07:23,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8388', 'grad_norm': '0.2192', 'learning_rate': '0.0001985', 'ppl': '2.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.5', 'tokens/total': 2613248, 'tokens/trainable': 2599457, 'epoch': '0.05616'}
  6%|██████████▉                                                                                                                                                                                       | 319/5680 [45:07<12:07:23,  8.14s/it]  6%|██████████▉                                                                                                                                                                                       | 320/5680 [45:15<12:08:48,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.605', 'grad_norm': '0.1995', 'learning_rate': '0.0001984', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.2', 'tokens/total': 2621440, 'tokens/trainable': 2607614, 'epoch': '0.05634'}
  6%|██████████▉                                                                                                                                                                                       | 320/5680 [45:15<12:08:48,  8.16s/it]  6%|██████████▉                                                                                                                                                                                       | 321/5680 [45:23<12:09:58,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8268', 'grad_norm': '0.271', 'learning_rate': '0.0001984', 'ppl': '2.286', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.3', 'tokens/total': 2629632, 'tokens/trainable': 2615754, 'epoch': '0.05651'}
  6%|██████████▉                                                                                                                                                                                       | 321/5680 [45:23<12:09:58,  8.17s/it]  6%|██████████▉                                                                                                                                                                                       | 322/5680 [45:31<12:09:18,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.9443', 'grad_norm': '0.2592', 'learning_rate': '0.0001984', 'ppl': '2.571', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.2', 'tokens/total': 2637824, 'tokens/trainable': 2623825, 'epoch': '0.05669'}
  6%|██████████▉                                                                                                                                                                                       | 322/5680 [45:31<12:09:18,  8.17s/it]  6%|███████████                                                                                                                                                                                       | 323/5680 [45:39<12:08:55,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.7857', 'grad_norm': '0.2106', 'learning_rate': '0.0001984', 'ppl': '2.194', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.5', 'tokens/total': 2646016, 'tokens/trainable': 2631928, 'epoch': '0.05687'}
  6%|███████████                                                                                                                                                                                       | 323/5680 [45:39<12:08:55,  8.16s/it]  6%|███████████                                                                                                                                                                                       | 324/5680 [45:48<12:06:58,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8064', 'grad_norm': '0.2055', 'learning_rate': '0.0001984', 'ppl': '2.24', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 2654208, 'tokens/trainable': 2640077, 'epoch': '0.05704'}
  6%|███████████                                                                                                                                                                                       | 324/5680 [45:48<12:06:58,  8.14s/it]  6%|███████████                                                                                                                                                                                       | 325/5680 [45:56<12:09:06,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8571', 'grad_norm': '0.2365', 'learning_rate': '0.0001984', 'ppl': '2.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.4', 'tokens/total': 2662400, 'tokens/trainable': 2648233, 'epoch': '0.05722'}
  6%|███████████                                                                                                                                                                                       | 325/5680 [45:56<12:09:06,  8.17s/it]  6%|███████████▏                                                                                                                                                                                      | 326/5680 [46:04<12:06:18,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7688', 'grad_norm': '0.2329', 'learning_rate': '0.0001984', 'ppl': '2.157', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 2670592, 'tokens/trainable': 2656413, 'epoch': '0.05739'}
  6%|███████████▏                                                                                                                                                                                      | 326/5680 [46:04<12:06:18,  8.14s/it]  6%|███████████▏                                                                                                                                                                                      | 327/5680 [46:12<12:07:32,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.5894', 'grad_norm': '0.2198', 'learning_rate': '0.0001984', 'ppl': '1.803', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.8', 'tokens/total': 2678784, 'tokens/trainable': 2664518, 'epoch': '0.05757'}
  6%|███████████▏                                                                                                                                                                                      | 327/5680 [46:12<12:07:32,  8.15s/it]  6%|███████████▏                                                                                                                                                                                      | 328/5680 [46:20<12:08:36,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.4994', 'grad_norm': '0.1906', 'learning_rate': '0.0001984', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.5', 'tokens/total': 2686976, 'tokens/trainable': 2672704, 'epoch': '0.05775'}
  6%|███████████▏                                                                                                                                                                                      | 328/5680 [46:20<12:08:36,  8.17s/it]  6%|███████████▏                                                                                                                                                                                      | 329/5680 [46:28<12:09:45,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.8733', 'grad_norm': '0.2503', 'learning_rate': '0.0001984', 'ppl': '2.395', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.2', 'tokens/total': 2695168, 'tokens/trainable': 2680887, 'epoch': '0.05792'}
  6%|███████████▏                                                                                                                                                                                      | 329/5680 [46:28<12:09:45,  8.18s/it]  6%|███████████▎                                                                                                                                                                                      | 330/5680 [46:37<12:10:09,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.9365', 'grad_norm': '0.2463', 'learning_rate': '0.0001983', 'ppl': '2.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.7', 'tokens/total': 2703360, 'tokens/trainable': 2689043, 'epoch': '0.0581'}
  6%|███████████▎                                                                                                                                                                                      | 330/5680 [46:37<12:10:09,  8.19s/it]  6%|███████████▎                                                                                                                                                                                      | 331/5680 [46:45<12:10:06,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.5753', 'grad_norm': '0.2033', 'learning_rate': '0.0001983', 'ppl': '1.778', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.4', 'tokens/total': 2711552, 'tokens/trainable': 2697119, 'epoch': '0.05827'}
  6%|███████████▎                                                                                                                                                                                      | 331/5680 [46:45<12:10:06,  8.19s/it]  6%|███████████▎                                                                                                                                                                                      | 332/5680 [46:53<12:09:12,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.7861', 'grad_norm': '0.2408', 'learning_rate': '0.0001983', 'ppl': '2.195', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2719744, 'tokens/trainable': 2705280, 'epoch': '0.05845'}
  6%|███████████▎                                                                                                                                                                                      | 332/5680 [46:53<12:09:12,  8.18s/it]  6%|███████████▎                                                                                                                                                                                      | 333/5680 [47:01<12:07:30,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.4625', 'grad_norm': '0.1732', 'learning_rate': '0.0001983', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2727936, 'tokens/trainable': 2713410, 'epoch': '0.05863'}
  6%|███████████▎                                                                                                                                                                                      | 333/5680 [47:01<12:07:30,  8.16s/it]  6%|███████████▍                                                                                                                                                                                      | 334/5680 [47:09<12:08:09,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8276', 'grad_norm': '0.2469', 'learning_rate': '0.0001983', 'ppl': '2.288', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.5', 'tokens/total': 2736128, 'tokens/trainable': 2721588, 'epoch': '0.0588'}
  6%|███████████▍                                                                                                                                                                                      | 334/5680 [47:09<12:08:09,  8.17s/it]  6%|███████████▍                                                                                                                                                                                      | 335/5680 [47:17<12:08:32,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.5027', 'grad_norm': '0.1877', 'learning_rate': '0.0001983', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.1', 'tokens/total': 2744320, 'tokens/trainable': 2729705, 'epoch': '0.05898'}
  6%|███████████▍                                                                                                                                                                                      | 335/5680 [47:17<12:08:32,  8.18s/it]  6%|███████████▍                                                                                                                                                                                      | 336/5680 [47:26<12:07:50,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.6451', 'grad_norm': '0.2047', 'learning_rate': '0.0001983', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.4', 'tokens/total': 2752512, 'tokens/trainable': 2737854, 'epoch': '0.05915'}
  6%|███████████▍                                                                                                                                                                                      | 336/5680 [47:26<12:07:50,  8.17s/it]  6%|███████████▌                                                                                                                                                                                      | 337/5680 [47:34<12:05:18,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.9237', 'grad_norm': '0.2251', 'learning_rate': '0.0001983', 'ppl': '2.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 2760704, 'tokens/trainable': 2746034, 'epoch': '0.05933'}
  6%|███████████▌                                                                                                                                                                                      | 337/5680 [47:34<12:05:18,  8.15s/it]  6%|███████████▌                                                                                                                                                                                      | 338/5680 [47:42<12:02:33,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.5467', 'grad_norm': '0.1849', 'learning_rate': '0.0001983', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 2768896, 'tokens/trainable': 2754195, 'epoch': '0.05951'}
  6%|███████████▌                                                                                                                                                                                      | 338/5680 [47:42<12:02:33,  8.12s/it]  6%|███████████▌                                                                                                                                                                                      | 339/5680 [47:50<12:01:00,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.8736', 'grad_norm': '0.2405', 'learning_rate': '0.0001983', 'ppl': '2.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 2777088, 'tokens/trainable': 2762354, 'epoch': '0.05968'}
  6%|███████████▌                                                                                                                                                                                      | 339/5680 [47:50<12:01:00,  8.10s/it]  6%|███████████▌                                                                                                                                                                                      | 340/5680 [47:58<12:04:00,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6699', 'grad_norm': '0.2131', 'learning_rate': '0.0001982', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.6', 'tokens/total': 2785280, 'tokens/trainable': 2770459, 'epoch': '0.05986'}
  6%|███████████▌                                                                                                                                                                                      | 340/5680 [47:58<12:04:00,  8.13s/it]  6%|███████████▋                                                                                                                                                                                      | 341/5680 [48:07<12:14:54,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '0.5888', 'grad_norm': '0.1857', 'learning_rate': '0.0001982', 'ppl': '1.802', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '952.4', 'tokens/total': 2793472, 'tokens/trainable': 2778598, 'epoch': '0.06004'}
  6%|███████████▋                                                                                                                                                                                      | 341/5680 [48:07<12:14:54,  8.26s/it]  6%|███████████▋                                                                                                                                                                                      | 342/5680 [48:15<12:15:27,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.8171', 'grad_norm': '0.2134', 'learning_rate': '0.0001982', 'ppl': '2.264', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.2', 'tokens/total': 2801664, 'tokens/trainable': 2786758, 'epoch': '0.06021'}
  6%|███████████▋                                                                                                                                                                                      | 342/5680 [48:15<12:15:27,  8.27s/it]  6%|███████████▋                                                                                                                                                                                      | 343/5680 [48:23<12:14:41,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '0.8659', 'grad_norm': '0.2317', 'learning_rate': '0.0001982', 'ppl': '2.377', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.8', 'tokens/total': 2809856, 'tokens/trainable': 2794921, 'epoch': '0.06039'}
  6%|███████████▋                                                                                                                                                                                      | 343/5680 [48:23<12:14:41,  8.26s/it]  6%|███████████▋                                                                                                                                                                                      | 344/5680 [48:31<12:10:47,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.8716', 'grad_norm': '0.2113', 'learning_rate': '0.0001982', 'ppl': '2.391', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 2818048, 'tokens/trainable': 2803026, 'epoch': '0.06056'}
  6%|███████████▋                                                                                                                                                                                      | 344/5680 [48:31<12:10:47,  8.22s/it]  6%|███████████▊                                                                                                                                                                                      | 345/5680 [48:39<12:11:02,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.6025', 'grad_norm': '0.2063', 'learning_rate': '0.0001982', 'ppl': '1.827', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.1', 'tokens/total': 2826240, 'tokens/trainable': 2811135, 'epoch': '0.06074'}
  6%|███████████▊                                                                                                                                                                                      | 345/5680 [48:39<12:11:02,  8.22s/it]  6%|███████████▊                                                                                                                                                                                      | 346/5680 [48:48<12:09:53,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.7571', 'grad_norm': '0.2146', 'learning_rate': '0.0001982', 'ppl': '2.132', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.1', 'tokens/total': 2834432, 'tokens/trainable': 2819236, 'epoch': '0.06092'}
  6%|███████████▊                                                                                                                                                                                      | 346/5680 [48:48<12:09:53,  8.21s/it]  6%|███████████▊                                                                                                                                                                                      | 347/5680 [48:56<12:07:30,  8.19s/it]                                                                                                                                                                                                                                             {'loss': '0.6198', 'grad_norm': '0.2248', 'learning_rate': '0.0001982', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2842624, 'tokens/trainable': 2827377, 'epoch': '0.06109'}
  6%|███████████▊                                                                                                                                                                                      | 347/5680 [48:56<12:07:30,  8.19s/it]  6%|███████████▉                                                                                                                                                                                      | 348/5680 [49:04<12:05:13,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.7449', 'grad_norm': '0.2137', 'learning_rate': '0.0001982', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2850816, 'tokens/trainable': 2835498, 'epoch': '0.06127'}
  6%|███████████▉                                                                                                                                                                                      | 348/5680 [49:04<12:05:13,  8.16s/it]  6%|███████████▉                                                                                                                                                                                      | 349/5680 [49:12<12:04:38,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.5515', 'grad_norm': '0.1904', 'learning_rate': '0.0001982', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 2859008, 'tokens/trainable': 2843641, 'epoch': '0.06144'}
  6%|███████████▉                                                                                                                                                                                      | 349/5680 [49:12<12:04:38,  8.16s/it]  6%|███████████▉                                                                                                                                                                                      | 350/5680 [49:20<12:04:22,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.8911', 'grad_norm': '0.2309', 'learning_rate': '0.0001981', 'ppl': '2.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2867200, 'tokens/trainable': 2851796, 'epoch': '0.06162'}
  6%|███████████▉                                                                                                                                                                                      | 350/5680 [49:20<12:04:22,  8.15s/it]  6%|███████████▉                                                                                                                                                                                      | 351/5680 [49:28<12:04:07,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.9079', 'grad_norm': '0.2139', 'learning_rate': '0.0001981', 'ppl': '2.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 2875392, 'tokens/trainable': 2859945, 'epoch': '0.0618'}
  6%|███████████▉                                                                                                                                                                                      | 351/5680 [49:28<12:04:07,  8.15s/it]  6%|████████████                                                                                                                                                                                      | 352/5680 [49:37<12:04:16,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.6134', 'grad_norm': '0.206', 'learning_rate': '0.0001981', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992', 'tokens/total': 2883584, 'tokens/trainable': 2868041, 'epoch': '0.06197'}
  6%|████████████                                                                                                                                                                                      | 352/5680 [49:37<12:04:16,  8.16s/it]  6%|████████████                                                                                                                                                                                      | 353/5680 [49:45<12:03:24,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.9637', 'grad_norm': '0.2225', 'learning_rate': '0.0001981', 'ppl': '2.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2891776, 'tokens/trainable': 2876184, 'epoch': '0.06215'}
  6%|████████████                                                                                                                                                                                      | 353/5680 [49:45<12:03:24,  8.15s/it]  6%|████████████                                                                                                                                                                                      | 354/5680 [49:53<12:04:35,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.9298', 'grad_norm': '0.2201', 'learning_rate': '0.0001981', 'ppl': '2.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 2899968, 'tokens/trainable': 2884358, 'epoch': '0.06232'}
  6%|████████████                                                                                                                                                                                      | 354/5680 [49:53<12:04:35,  8.16s/it]  6%|████████████▏                                                                                                                                                                                     | 355/5680 [50:01<12:03:00,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.7835', 'grad_norm': '0.2296', 'learning_rate': '0.0001981', 'ppl': '2.189', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2908160, 'tokens/trainable': 2892527, 'epoch': '0.0625'}
  6%|████████████▏                                                                                                                                                                                     | 355/5680 [50:01<12:03:00,  8.15s/it]  6%|████████████▏                                                                                                                                                                                     | 356/5680 [50:09<12:01:12,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.8877', 'grad_norm': '0.2936', 'learning_rate': '0.0001981', 'ppl': '2.429', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.3', 'tokens/total': 2916352, 'tokens/trainable': 2900595, 'epoch': '0.06268'}
  6%|████████████▏                                                                                                                                                                                     | 356/5680 [50:09<12:01:12,  8.13s/it]  6%|████████████▏                                                                                                                                                                                     | 357/5680 [50:17<12:02:01,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.6103', 'grad_norm': '0.1942', 'learning_rate': '0.0001981', 'ppl': '1.841', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2924544, 'tokens/trainable': 2908757, 'epoch': '0.06285'}
  6%|████████████▏                                                                                                                                                                                     | 357/5680 [50:17<12:02:01,  8.14s/it]  6%|████████████▏                                                                                                                                                                                     | 358/5680 [50:25<12:01:32,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.986', 'grad_norm': '0.2319', 'learning_rate': '0.0001981', 'ppl': '2.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2932736, 'tokens/trainable': 2916942, 'epoch': '0.06303'}
  6%|████████████▏                                                                                                                                                                                     | 358/5680 [50:25<12:01:32,  8.13s/it]  6%|████████████▎                                                                                                                                                                                     | 359/5680 [50:33<12:00:22,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.719', 'grad_norm': '0.2516', 'learning_rate': '0.000198', 'ppl': '2.052', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 2940928, 'tokens/trainable': 2925046, 'epoch': '0.0632'}
  6%|████████████▎                                                                                                                                                                                     | 359/5680 [50:33<12:00:22,  8.12s/it]  6%|████████████▎                                                                                                                                                                                     | 360/5680 [50:41<11:58:37,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.8962', 'grad_norm': '0.2346', 'learning_rate': '0.000198', 'ppl': '2.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 2949120, 'tokens/trainable': 2933162, 'epoch': '0.06338'}
  6%|████████████▎                                                                                                                                                                                     | 360/5680 [50:41<11:58:37,  8.10s/it]  6%|████████████▎                                                                                                                                                                                     | 361/5680 [50:49<11:56:14,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8157', 'grad_norm': '0.2225', 'learning_rate': '0.000198', 'ppl': '2.261', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 2957312, 'tokens/trainable': 2941271, 'epoch': '0.06356'}
  6%|████████████▎                                                                                                                                                                                     | 361/5680 [50:49<11:56:14,  8.08s/it]  6%|████████████▎                                                                                                                                                                                     | 362/5680 [50:57<11:53:26,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.8528', 'grad_norm': '0.2155', 'learning_rate': '0.000198', 'ppl': '2.346', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 2965504, 'tokens/trainable': 2949433, 'epoch': '0.06373'}
  6%|████████████▎                                                                                                                                                                                     | 362/5680 [50:57<11:53:26,  8.05s/it]  6%|████████████▍                                                                                                                                                                                     | 363/5680 [51:06<11:55:38,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.5989', 'grad_norm': '0.1912', 'learning_rate': '0.000198', 'ppl': '1.82', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 2973696, 'tokens/trainable': 2957584, 'epoch': '0.06391'}
  6%|████████████▍                                                                                                                                                                                     | 363/5680 [51:06<11:55:38,  8.08s/it]  6%|████████████▍                                                                                                                                                                                     | 364/5680 [51:14<11:55:39,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.61', 'grad_norm': '0.2449', 'learning_rate': '0.000198', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 2981888, 'tokens/trainable': 2965741, 'epoch': '0.06408'}
  6%|████████████▍                                                                                                                                                                                     | 364/5680 [51:14<11:55:39,  8.08s/it]  6%|████████████▍                                                                                                                                                                                     | 365/5680 [51:22<11:55:53,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '1.246', 'grad_norm': '0.2979', 'learning_rate': '0.000198', 'ppl': '3.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 2990080, 'tokens/trainable': 2973912, 'epoch': '0.06426'}
  6%|████████████▍                                                                                                                                                                                     | 365/5680 [51:22<11:55:53,  8.08s/it]  6%|████████████▌                                                                                                                                                                                     | 366/5680 [51:30<11:56:03,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.8371', 'grad_norm': '0.2385', 'learning_rate': '0.000198', 'ppl': '2.31', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 2998272, 'tokens/trainable': 2982081, 'epoch': '0.06444'}
  6%|████████████▌                                                                                                                                                                                     | 366/5680 [51:30<11:56:03,  8.09s/it]  6%|████████████▌                                                                                                                                                                                     | 367/5680 [51:38<11:55:11,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8651', 'grad_norm': '0.2256', 'learning_rate': '0.000198', 'ppl': '2.375', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3006464, 'tokens/trainable': 2990231, 'epoch': '0.06461'}
  6%|████████████▌                                                                                                                                                                                     | 367/5680 [51:38<11:55:11,  8.08s/it]  6%|████████████▌                                                                                                                                                                                     | 368/5680 [51:46<11:53:44,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9398', 'grad_norm': '0.2523', 'learning_rate': '0.0001979', 'ppl': '2.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 3014656, 'tokens/trainable': 2998373, 'epoch': '0.06479'}
  6%|████████████▌                                                                                                                                                                                     | 368/5680 [51:46<11:53:44,  8.06s/it]  6%|████████████▌                                                                                                                                                                                     | 369/5680 [51:54<11:53:05,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.5914', 'grad_norm': '0.2053', 'learning_rate': '0.0001979', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 3022848, 'tokens/trainable': 3006497, 'epoch': '0.06496'}
  6%|████████████▌                                                                                                                                                                                     | 369/5680 [51:54<11:53:05,  8.06s/it]  7%|████████████▋                                                                                                                                                                                     | 370/5680 [52:02<11:57:28,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.654', 'grad_norm': '0.2083', 'learning_rate': '0.0001979', 'ppl': '1.923', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.2', 'tokens/total': 3031040, 'tokens/trainable': 3014633, 'epoch': '0.06514'}
  7%|████████████▋                                                                                                                                                                                     | 370/5680 [52:02<11:57:28,  8.11s/it]  7%|████████████▋                                                                                                                                                                                     | 371/5680 [52:10<11:59:16,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.9227', 'grad_norm': '0.2309', 'learning_rate': '0.0001979', 'ppl': '2.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.1', 'tokens/total': 3039232, 'tokens/trainable': 3022767, 'epoch': '0.06532'}
  7%|████████████▋                                                                                                                                                                                     | 371/5680 [52:10<11:59:16,  8.13s/it]  7%|████████████▋                                                                                                                                                                                     | 372/5680 [52:18<11:57:24,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '1.065', 'grad_norm': '0.3078', 'learning_rate': '0.0001979', 'ppl': '2.9', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 3047424, 'tokens/trainable': 3030908, 'epoch': '0.06549'}
  7%|████████████▋                                                                                                                                                                                     | 372/5680 [52:18<11:57:24,  8.11s/it]  7%|████████████▋                                                                                                                                                                                     | 373/5680 [52:27<11:55:16,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.7728', 'grad_norm': '0.2189', 'learning_rate': '0.0001979', 'ppl': '2.166', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 3055616, 'tokens/trainable': 3039029, 'epoch': '0.06567'}
  7%|████████████▋                                                                                                                                                                                     | 373/5680 [52:27<11:55:16,  8.09s/it]  7%|████████████▊                                                                                                                                                                                     | 374/5680 [52:35<11:54:24,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8224', 'grad_norm': '0.2184', 'learning_rate': '0.0001979', 'ppl': '2.276', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3063808, 'tokens/trainable': 3047190, 'epoch': '0.06585'}
  7%|████████████▊                                                                                                                                                                                     | 374/5680 [52:35<11:54:24,  8.08s/it]  7%|████████████▊                                                                                                                                                                                     | 375/5680 [52:43<11:54:12,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.9099', 'grad_norm': '0.2271', 'learning_rate': '0.0001979', 'ppl': '2.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 3072000, 'tokens/trainable': 3055349, 'epoch': '0.06602'}
  7%|████████████▊                                                                                                                                                                                     | 375/5680 [52:43<11:54:12,  8.08s/it]  7%|████████████▊                                                                                                                                                                                     | 376/5680 [52:51<11:53:19,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6634', 'grad_norm': '0.2007', 'learning_rate': '0.0001979', 'ppl': '1.941', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 3080192, 'tokens/trainable': 3063447, 'epoch': '0.0662'}
  7%|████████████▊                                                                                                                                                                                     | 376/5680 [52:51<11:53:19,  8.07s/it]  7%|████████████▉                                                                                                                                                                                     | 377/5680 [52:59<11:51:49,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5052', 'grad_norm': '0.1784', 'learning_rate': '0.0001978', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 3088384, 'tokens/trainable': 3071619, 'epoch': '0.06637'}
  7%|████████████▉                                                                                                                                                                                     | 377/5680 [52:59<11:51:49,  8.05s/it]  7%|████████████▉                                                                                                                                                                                     | 378/5680 [53:07<11:55:07,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5704', 'grad_norm': '0.2011', 'learning_rate': '0.0001978', 'ppl': '1.769', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.2', 'tokens/total': 3096576, 'tokens/trainable': 3079712, 'epoch': '0.06655'}
  7%|████████████▉                                                                                                                                                                                     | 378/5680 [53:07<11:55:07,  8.09s/it]  7%|████████████▉                                                                                                                                                                                     | 379/5680 [53:15<11:56:57,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.9089', 'grad_norm': '0.2474', 'learning_rate': '0.0001978', 'ppl': '2.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.2', 'tokens/total': 3104768, 'tokens/trainable': 3087846, 'epoch': '0.06673'}
  7%|████████████▉                                                                                                                                                                                     | 379/5680 [53:15<11:56:57,  8.11s/it]  7%|████████████▉                                                                                                                                                                                     | 380/5680 [53:23<11:58:50,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8793', 'grad_norm': '0.2158', 'learning_rate': '0.0001978', 'ppl': '2.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 3112960, 'tokens/trainable': 3096036, 'epoch': '0.0669'}
  7%|████████████▉                                                                                                                                                                                     | 380/5680 [53:23<11:58:50,  8.14s/it]  7%|█████████████                                                                                                                                                                                     | 381/5680 [53:31<12:00:04,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.5947', 'grad_norm': '0.1827', 'learning_rate': '0.0001978', 'ppl': '1.812', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.5', 'tokens/total': 3121152, 'tokens/trainable': 3104153, 'epoch': '0.06708'}
  7%|█████████████                                                                                                                                                                                     | 381/5680 [53:31<12:00:04,  8.15s/it]  7%|█████████████                                                                                                                                                                                     | 382/5680 [53:40<12:01:36,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7356', 'grad_norm': '0.2156', 'learning_rate': '0.0001978', 'ppl': '2.087', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.1', 'tokens/total': 3129344, 'tokens/trainable': 3112308, 'epoch': '0.06725'}
  7%|█████████████                                                                                                                                                                                     | 382/5680 [53:40<12:01:36,  8.17s/it]  7%|█████████████                                                                                                                                                                                     | 383/5680 [53:48<12:01:08,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.5906', 'grad_norm': '0.1924', 'learning_rate': '0.0001978', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 3137536, 'tokens/trainable': 3120465, 'epoch': '0.06743'}
  7%|█████████████                                                                                                                                                                                     | 383/5680 [53:48<12:01:08,  8.17s/it]  7%|█████████████                                                                                                                                                                                     | 384/5680 [53:56<11:58:55,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7447', 'grad_norm': '0.2191', 'learning_rate': '0.0001978', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 3145728, 'tokens/trainable': 3128555, 'epoch': '0.06761'}
  7%|█████████████                                                                                                                                                                                     | 384/5680 [53:56<11:58:55,  8.14s/it]  7%|█████████████▏                                                                                                                                                                                    | 385/5680 [54:04<11:55:05,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.9233', 'grad_norm': '0.2315', 'learning_rate': '0.0001978', 'ppl': '2.518', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 3153920, 'tokens/trainable': 3136675, 'epoch': '0.06778'}
  7%|█████████████▏                                                                                                                                                                                    | 385/5680 [54:04<11:55:05,  8.10s/it]  7%|█████████████▏                                                                                                                                                                                    | 386/5680 [54:12<11:52:16,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.8294', 'grad_norm': '0.2258', 'learning_rate': '0.0001977', 'ppl': '2.292', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 3162112, 'tokens/trainable': 3144819, 'epoch': '0.06796'}
  7%|█████████████▏                                                                                                                                                                                    | 386/5680 [54:12<11:52:16,  8.07s/it]  7%|█████████████▏                                                                                                                                                                                    | 387/5680 [54:20<11:50:52,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9146', 'grad_norm': '0.2595', 'learning_rate': '0.0001977', 'ppl': '2.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3170304, 'tokens/trainable': 3152941, 'epoch': '0.06813'}
  7%|█████████████▏                                                                                                                                                                                    | 387/5680 [54:20<11:50:52,  8.06s/it]  7%|█████████████▎                                                                                                                                                                                    | 388/5680 [54:28<11:50:19,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.9176', 'grad_norm': '0.228', 'learning_rate': '0.0001977', 'ppl': '2.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3178496, 'tokens/trainable': 3161080, 'epoch': '0.06831'}
  7%|█████████████▎                                                                                                                                                                                    | 388/5680 [54:28<11:50:19,  8.05s/it]  7%|█████████████▎                                                                                                                                                                                    | 389/5680 [54:36<11:48:57,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.8894', 'grad_norm': '0.2329', 'learning_rate': '0.0001977', 'ppl': '2.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3186688, 'tokens/trainable': 3169180, 'epoch': '0.06849'}
  7%|█████████████▎                                                                                                                                                                                    | 389/5680 [54:36<11:48:57,  8.04s/it]  7%|█████████████▎                                                                                                                                                                                    | 390/5680 [54:44<11:47:38,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5079', 'grad_norm': '0.2018', 'learning_rate': '0.0001977', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 3194880, 'tokens/trainable': 3177353, 'epoch': '0.06866'}
  7%|█████████████▎                                                                                                                                                                                    | 390/5680 [54:44<11:47:38,  8.03s/it]  7%|█████████████▎                                                                                                                                                                                    | 391/5680 [54:52<11:46:18,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7469', 'grad_norm': '0.2233', 'learning_rate': '0.0001977', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 3203072, 'tokens/trainable': 3185507, 'epoch': '0.06884'}
  7%|█████████████▎                                                                                                                                                                                    | 391/5680 [54:52<11:46:18,  8.01s/it]  7%|█████████████▍                                                                                                                                                                                    | 392/5680 [55:00<11:46:16,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6294', 'grad_norm': '0.1868', 'learning_rate': '0.0001977', 'ppl': '1.876', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 3211264, 'tokens/trainable': 3193657, 'epoch': '0.06901'}
  7%|█████████████▍                                                                                                                                                                                    | 392/5680 [55:00<11:46:16,  8.01s/it]  7%|█████████████▍                                                                                                                                                                                    | 393/5680 [55:08<11:43:44,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7714', 'grad_norm': '0.2098', 'learning_rate': '0.0001977', 'ppl': '2.163', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 3219456, 'tokens/trainable': 3201773, 'epoch': '0.06919'}
  7%|█████████████▍                                                                                                                                                                                    | 393/5680 [55:08<11:43:44,  7.99s/it]  7%|█████████████▍                                                                                                                                                                                    | 394/5680 [55:16<11:41:31,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.9827', 'grad_norm': '0.235', 'learning_rate': '0.0001976', 'ppl': '2.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 3227648, 'tokens/trainable': 3209952, 'epoch': '0.06937'}
  7%|█████████████▍                                                                                                                                                                                    | 394/5680 [55:16<11:41:31,  7.96s/it]  7%|█████████████▍                                                                                                                                                                                    | 395/5680 [55:24<11:40:03,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6872', 'grad_norm': '0.2064', 'learning_rate': '0.0001976', 'ppl': '1.988', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 3235840, 'tokens/trainable': 3218121, 'epoch': '0.06954'}
  7%|█████████████▍                                                                                                                                                                                    | 395/5680 [55:24<11:40:03,  7.95s/it]  7%|█████████████▌                                                                                                                                                                                    | 396/5680 [55:32<11:38:42,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7191', 'grad_norm': '0.2015', 'learning_rate': '0.0001976', 'ppl': '2.053', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 3244032, 'tokens/trainable': 3226286, 'epoch': '0.06972'}
  7%|█████████████▌                                                                                                                                                                                    | 396/5680 [55:32<11:38:42,  7.93s/it]  7%|█████████████▌                                                                                                                                                                                    | 397/5680 [55:40<11:40:35,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7491', 'grad_norm': '0.2084', 'learning_rate': '0.0001976', 'ppl': '2.115', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 3252224, 'tokens/trainable': 3234413, 'epoch': '0.06989'}
  7%|█████████████▌                                                                                                                                                                                    | 397/5680 [55:40<11:40:35,  7.96s/it]  7%|█████████████▌                                                                                                                                                                                    | 398/5680 [55:48<11:42:30,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6925', 'grad_norm': '0.2316', 'learning_rate': '0.0001976', 'ppl': '1.999', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 3260416, 'tokens/trainable': 3242564, 'epoch': '0.07007'}
  7%|█████████████▌                                                                                                                                                                                    | 398/5680 [55:48<11:42:30,  7.98s/it]  7%|█████████████▋                                                                                                                                                                                    | 399/5680 [55:56<11:44:00,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6229', 'grad_norm': '0.1914', 'learning_rate': '0.0001976', 'ppl': '1.864', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 3268608, 'tokens/trainable': 3250744, 'epoch': '0.07025'}
  7%|█████████████▋                                                                                                                                                                                    | 399/5680 [55:56<11:44:00,  8.00s/it]  7%|█████████████▋                                                                                                                                                                                    | 400/5680 [56:04<11:45:05,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.811', 'grad_norm': '0.2257', 'learning_rate': '0.0001976', 'ppl': '2.25', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 3276800, 'tokens/trainable': 3258918, 'epoch': '0.07042'}
  7%|█████████████▋                                                                                                                                                                                    | 400/5680 [56:04<11:45:05,  8.01s/it]  7%|█████████████▋                                                                                                                                                                                    | 401/5680 [56:12<11:46:27,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.8066', 'grad_norm': '0.2092', 'learning_rate': '0.0001976', 'ppl': '2.24', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 3284992, 'tokens/trainable': 3267061, 'epoch': '0.0706'}
  7%|█████████████▋                                                                                                                                                                                    | 401/5680 [56:12<11:46:27,  8.03s/it]  7%|█████████████▋                                                                                                                                                                                    | 402/5680 [56:20<11:46:57,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.748', 'grad_norm': '0.216', 'learning_rate': '0.0001976', 'ppl': '2.113', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 3293184, 'tokens/trainable': 3275193, 'epoch': '0.07077'}
  7%|█████████████▋                                                                                                                                                                                    | 402/5680 [56:20<11:46:57,  8.04s/it]  7%|█████████████▊                                                                                                                                                                                    | 403/5680 [56:28<11:47:41,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5092', 'grad_norm': '0.2046', 'learning_rate': '0.0001975', 'ppl': '1.664', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 3301376, 'tokens/trainable': 3283298, 'epoch': '0.07095'}
  7%|█████████████▊                                                                                                                                                                                    | 403/5680 [56:28<11:47:41,  8.05s/it]  7%|█████████████▊                                                                                                                                                                                    | 404/5680 [56:36<11:48:00,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.8098', 'grad_norm': '0.2193', 'learning_rate': '0.0001975', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 3309568, 'tokens/trainable': 3291402, 'epoch': '0.07113'}
  7%|█████████████▊                                                                                                                                                                                    | 404/5680 [56:36<11:48:00,  8.05s/it]  7%|█████████████▊                                                                                                                                                                                    | 405/5680 [56:44<11:47:38,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5772', 'grad_norm': '0.1892', 'learning_rate': '0.0001975', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 3317760, 'tokens/trainable': 3299499, 'epoch': '0.0713'}
  7%|█████████████▊                                                                                                                                                                                    | 405/5680 [56:44<11:47:38,  8.05s/it]  7%|█████████████▊                                                                                                                                                                                    | 406/5680 [56:52<11:47:43,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5907', 'grad_norm': '0.1973', 'learning_rate': '0.0001975', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3325952, 'tokens/trainable': 3307647, 'epoch': '0.07148'}
  7%|█████████████▊                                                                                                                                                                                    | 406/5680 [56:52<11:47:43,  8.05s/it]  7%|█████████████▉                                                                                                                                                                                    | 407/5680 [57:00<11:46:26,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.6533', 'grad_norm': '0.2325', 'learning_rate': '0.0001975', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 3334144, 'tokens/trainable': 3315806, 'epoch': '0.07165'}
  7%|█████████████▉                                                                                                                                                                                    | 407/5680 [57:00<11:46:26,  8.04s/it]  7%|█████████████▉                                                                                                                                                                                    | 408/5680 [57:08<11:46:36,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.6786', 'grad_norm': '0.215', 'learning_rate': '0.0001975', 'ppl': '1.971', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3342336, 'tokens/trainable': 3323953, 'epoch': '0.07183'}
  7%|█████████████▉                                                                                                                                                                                    | 408/5680 [57:08<11:46:36,  8.04s/it]  7%|█████████████▉                                                                                                                                                                                    | 409/5680 [57:16<11:49:19,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6058', 'grad_norm': '0.1889', 'learning_rate': '0.0001975', 'ppl': '1.833', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 3350528, 'tokens/trainable': 3332119, 'epoch': '0.07201'}
  7%|█████████████▉                                                                                                                                                                                    | 409/5680 [57:16<11:49:19,  8.07s/it]  7%|██████████████                                                                                                                                                                                    | 410/5680 [57:24<11:49:20,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.8551', 'grad_norm': '0.2569', 'learning_rate': '0.0001975', 'ppl': '2.352', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.5', 'tokens/total': 3358720, 'tokens/trainable': 3340192, 'epoch': '0.07218'}
  7%|██████████████                                                                                                                                                                                    | 410/5680 [57:24<11:49:20,  8.08s/it]  7%|██████████████                                                                                                                                                                                    | 411/5680 [57:32<11:48:36,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.7661', 'grad_norm': '0.2158', 'learning_rate': '0.0001974', 'ppl': '2.151', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 3366912, 'tokens/trainable': 3348281, 'epoch': '0.07236'}
  7%|██████████████                                                                                                                                                                                    | 411/5680 [57:32<11:48:36,  8.07s/it]  7%|██████████████                                                                                                                                                                                    | 412/5680 [57:41<11:48:15,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.7595', 'grad_norm': '0.2194', 'learning_rate': '0.0001974', 'ppl': '2.137', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 3375104, 'tokens/trainable': 3356455, 'epoch': '0.07254'}
  7%|██████████████                                                                                                                                                                                    | 412/5680 [57:41<11:48:15,  8.07s/it]  7%|██████████████                                                                                                                                                                                    | 413/5680 [57:49<11:47:29,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '1.212', 'grad_norm': '0.2596', 'learning_rate': '0.0001974', 'ppl': '3.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3383296, 'tokens/trainable': 3364597, 'epoch': '0.07271'}
  7%|██████████████                                                                                                                                                                                    | 413/5680 [57:49<11:47:29,  8.06s/it]  7%|██████████████▏                                                                                                                                                                                   | 414/5680 [57:57<11:47:09,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7445', 'grad_norm': '0.2239', 'learning_rate': '0.0001974', 'ppl': '2.105', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 3391488, 'tokens/trainable': 3372787, 'epoch': '0.07289'}
  7%|██████████████▏                                                                                                                                                                                   | 414/5680 [57:57<11:47:09,  8.06s/it]  7%|██████████████▏                                                                                                                                                                                   | 415/5680 [58:05<11:46:13,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.9396', 'grad_norm': '0.2231', 'learning_rate': '0.0001974', 'ppl': '2.559', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 3399680, 'tokens/trainable': 3380955, 'epoch': '0.07306'}
  7%|██████████████▏                                                                                                                                                                                   | 415/5680 [58:05<11:46:13,  8.05s/it]  7%|██████████████▏                                                                                                                                                                                   | 416/5680 [58:13<11:46:28,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.6222', 'grad_norm': '0.2265', 'learning_rate': '0.0001974', 'ppl': '1.863', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 3407872, 'tokens/trainable': 3389101, 'epoch': '0.07324'}
  7%|██████████████▏                                                                                                                                                                                   | 416/5680 [58:13<11:46:28,  8.05s/it]  7%|██████████████▏                                                                                                                                                                                   | 417/5680 [58:21<11:44:06,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.8344', 'grad_norm': '0.2287', 'learning_rate': '0.0001974', 'ppl': '2.303', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 3416064, 'tokens/trainable': 3397236, 'epoch': '0.07342'}
  7%|██████████████▏                                                                                                                                                                                   | 417/5680 [58:21<11:44:06,  8.03s/it]  7%|██████████████▎                                                                                                                                                                                   | 418/5680 [58:29<11:40:32,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '1.176', 'grad_norm': '0.2749', 'learning_rate': '0.0001974', 'ppl': '3.241', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 3424256, 'tokens/trainable': 3405354, 'epoch': '0.07359'}
  7%|██████████████▎                                                                                                                                                                                   | 418/5680 [58:29<11:40:32,  7.99s/it]  7%|██████████████▎                                                                                                                                                                                   | 419/5680 [58:36<11:37:43,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8733', 'grad_norm': '0.2247', 'learning_rate': '0.0001973', 'ppl': '2.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 3432448, 'tokens/trainable': 3413486, 'epoch': '0.07377'}
  7%|██████████████▎                                                                                                                                                                                   | 419/5680 [58:36<11:37:43,  7.96s/it]  7%|██████████████▎                                                                                                                                                                                   | 420/5680 [58:45<11:44:51,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.6664', 'grad_norm': '0.2181', 'learning_rate': '0.0001973', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.3', 'tokens/total': 3440640, 'tokens/trainable': 3421629, 'epoch': '0.07394'}
  7%|██████████████▎                                                                                                                                                                                   | 420/5680 [58:45<11:44:51,  8.04s/it]  7%|██████████████▍                                                                                                                                                                                   | 421/5680 [58:53<11:40:48,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '1.063', 'grad_norm': '0.2644', 'learning_rate': '0.0001973', 'ppl': '2.896', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3448832, 'tokens/trainable': 3429729, 'epoch': '0.07412'}
  7%|██████████████▍                                                                                                                                                                                   | 421/5680 [58:53<11:40:48,  8.00s/it]  7%|██████████████▍                                                                                                                                                                                   | 422/5680 [59:00<11:38:46,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4679', 'grad_norm': '0.1798', 'learning_rate': '0.0001973', 'ppl': '1.597', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 3457024, 'tokens/trainable': 3437890, 'epoch': '0.0743'}
  7%|██████████████▍                                                                                                                                                                                   | 422/5680 [59:00<11:38:46,  7.97s/it]  7%|██████████████▍                                                                                                                                                                                   | 423/5680 [59:08<11:36:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7908', 'grad_norm': '0.2204', 'learning_rate': '0.0001973', 'ppl': '2.205', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 3465216, 'tokens/trainable': 3445981, 'epoch': '0.07447'}
  7%|██████████████▍                                                                                                                                                                                   | 423/5680 [59:08<11:36:55,  7.95s/it]  7%|██████████████▍                                                                                                                                                                                   | 424/5680 [59:16<11:36:21,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8307', 'grad_norm': '0.2249', 'learning_rate': '0.0001973', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 3473408, 'tokens/trainable': 3454141, 'epoch': '0.07465'}
  7%|██████████████▍                                                                                                                                                                                   | 424/5680 [59:16<11:36:21,  7.95s/it]  7%|██████████████▌                                                                                                                                                                                   | 425/5680 [59:24<11:35:42,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8245', 'grad_norm': '0.2173', 'learning_rate': '0.0001973', 'ppl': '2.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 3481600, 'tokens/trainable': 3462240, 'epoch': '0.07482'}
  7%|██████████████▌                                                                                                                                                                                   | 425/5680 [59:24<11:35:42,  7.94s/it]  8%|██████████████▌                                                                                                                                                                                   | 426/5680 [59:32<11:35:28,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7812', 'grad_norm': '0.2396', 'learning_rate': '0.0001972', 'ppl': '2.184', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3489792, 'tokens/trainable': 3470391, 'epoch': '0.075'}
  8%|██████████████▌                                                                                                                                                                                   | 426/5680 [59:32<11:35:28,  7.94s/it]  8%|██████████████▌                                                                                                                                                                                   | 427/5680 [59:40<11:32:43,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5567', 'grad_norm': '0.1842', 'learning_rate': '0.0001972', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 3497984, 'tokens/trainable': 3478480, 'epoch': '0.07518'}
  8%|██████████████▌                                                                                                                                                                                   | 427/5680 [59:40<11:32:43,  7.91s/it]  8%|██████████████▌                                                                                                                                                                                   | 428/5680 [59:48<11:32:26,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9441', 'grad_norm': '0.2421', 'learning_rate': '0.0001972', 'ppl': '2.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3506176, 'tokens/trainable': 3486598, 'epoch': '0.07535'}
  8%|██████████████▌                                                                                                                                                                                   | 428/5680 [59:48<11:32:26,  7.91s/it]  8%|██████████████▋                                                                                                                                                                                   | 429/5680 [59:56<11:30:47,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.9689', 'grad_norm': '0.2227', 'learning_rate': '0.0001972', 'ppl': '2.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 3514368, 'tokens/trainable': 3494746, 'epoch': '0.07553'}
  8%|██████████████▋                                                                                                                                                                                   | 429/5680 [59:56<11:30:47,  7.89s/it]  8%|██████████████▌                                                                                                                                                                                 | 430/5680 [1:00:04<11:30:21,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.066', 'grad_norm': '0.2779', 'learning_rate': '0.0001972', 'ppl': '2.905', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 3522560, 'tokens/trainable': 3502871, 'epoch': '0.0757'}
  8%|██████████████▌                                                                                                                                                                                 | 430/5680 [1:00:04<11:30:21,  7.89s/it]  8%|██████████████▌                                                                                                                                                                                 | 431/5680 [1:00:12<11:31:06,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5707', 'grad_norm': '0.1864', 'learning_rate': '0.0001972', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 3530752, 'tokens/trainable': 3511032, 'epoch': '0.07588'}
  8%|██████████████▌                                                                                                                                                                                 | 431/5680 [1:00:12<11:31:06,  7.90s/it]  8%|██████████████▌                                                                                                                                                                                 | 432/5680 [1:00:20<11:31:53,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6322', 'grad_norm': '0.2009', 'learning_rate': '0.0001972', 'ppl': '1.882', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 3538944, 'tokens/trainable': 3519194, 'epoch': '0.07606'}
  8%|██████████████▌                                                                                                                                                                                 | 432/5680 [1:00:20<11:31:53,  7.91s/it]  8%|██████████████▋                                                                                                                                                                                 | 433/5680 [1:00:28<11:49:04,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.4964', 'grad_norm': '0.186', 'learning_rate': '0.0001972', 'ppl': '1.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '947.1', 'tokens/total': 3547136, 'tokens/trainable': 3527309, 'epoch': '0.07623'}
  8%|██████████████▋                                                                                                                                                                                 | 433/5680 [1:00:28<11:49:04,  8.11s/it]  8%|██████████████▋                                                                                                                                                                                 | 434/5680 [1:00:36<11:43:35,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.7453', 'grad_norm': '0.2173', 'learning_rate': '0.0001971', 'ppl': '2.107', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 3555328, 'tokens/trainable': 3535372, 'epoch': '0.07641'}
  8%|██████████████▋                                                                                                                                                                                 | 434/5680 [1:00:36<11:43:35,  8.05s/it]  8%|██████████████▋                                                                                                                                                                                 | 435/5680 [1:00:44<11:39:34,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.7847', 'grad_norm': '0.2368', 'learning_rate': '0.0001971', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 3563520, 'tokens/trainable': 3543525, 'epoch': '0.07658'}
  8%|██████████████▋                                                                                                                                                                                 | 435/5680 [1:00:44<11:39:34,  8.00s/it]  8%|██████████████▋                                                                                                                                                                                 | 436/5680 [1:00:52<11:36:29,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7018', 'grad_norm': '0.2265', 'learning_rate': '0.0001971', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 3571712, 'tokens/trainable': 3551602, 'epoch': '0.07676'}
  8%|██████████████▋                                                                                                                                                                                 | 436/5680 [1:00:52<11:36:29,  7.97s/it]  8%|██████████████▊                                                                                                                                                                                 | 437/5680 [1:01:00<11:34:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6688', 'grad_norm': '0.2201', 'learning_rate': '0.0001971', 'ppl': '1.952', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3579904, 'tokens/trainable': 3559725, 'epoch': '0.07694'}
  8%|██████████████▊                                                                                                                                                                                 | 437/5680 [1:01:00<11:34:55,  7.95s/it]  8%|██████████████▊                                                                                                                                                                                 | 438/5680 [1:01:08<11:33:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4176', 'grad_norm': '0.1702', 'learning_rate': '0.0001971', 'ppl': '1.518', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 3588096, 'tokens/trainable': 3567860, 'epoch': '0.07711'}
  8%|██████████████▊                                                                                                                                                                                 | 438/5680 [1:01:08<11:33:47,  7.94s/it]  8%|██████████████▊                                                                                                                                                                                 | 439/5680 [1:01:16<11:32:50,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5489', 'grad_norm': '0.2161', 'learning_rate': '0.0001971', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3596288, 'tokens/trainable': 3575985, 'epoch': '0.07729'}
  8%|██████████████▊                                                                                                                                                                                 | 439/5680 [1:01:16<11:32:50,  7.93s/it]  8%|██████████████▊                                                                                                                                                                                 | 440/5680 [1:01:23<11:32:04,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '1.069', 'grad_norm': '0.2561', 'learning_rate': '0.0001971', 'ppl': '2.912', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3604480, 'tokens/trainable': 3584102, 'epoch': '0.07746'}
  8%|██████████████▊                                                                                                                                                                                 | 440/5680 [1:01:23<11:32:04,  7.92s/it]  8%|██████████████▉                                                                                                                                                                                 | 441/5680 [1:01:31<11:31:47,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8688', 'grad_norm': '0.2312', 'learning_rate': '0.0001971', 'ppl': '2.384', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 3612672, 'tokens/trainable': 3592286, 'epoch': '0.07764'}
  8%|██████████████▉                                                                                                                                                                                 | 441/5680 [1:01:31<11:31:47,  7.92s/it]  8%|██████████████▉                                                                                                                                                                                 | 442/5680 [1:01:39<11:31:44,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9059', 'grad_norm': '0.2317', 'learning_rate': '0.000197', 'ppl': '2.474', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 3620864, 'tokens/trainable': 3600419, 'epoch': '0.07782'}
  8%|██████████████▉                                                                                                                                                                                 | 442/5680 [1:01:39<11:31:44,  7.92s/it]  8%|██████████████▉                                                                                                                                                                                 | 443/5680 [1:01:47<11:31:49,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.8413', 'grad_norm': '0.2306', 'learning_rate': '0.000197', 'ppl': '2.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 3629056, 'tokens/trainable': 3608514, 'epoch': '0.07799'}
  8%|██████████████▉                                                                                                                                                                                 | 443/5680 [1:01:47<11:31:49,  7.93s/it]  8%|███████████████                                                                                                                                                                                 | 444/5680 [1:01:55<11:31:44,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '1.062', 'grad_norm': '0.2812', 'learning_rate': '0.000197', 'ppl': '2.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 3637248, 'tokens/trainable': 3616622, 'epoch': '0.07817'}
  8%|███████████████                                                                                                                                                                                 | 444/5680 [1:01:55<11:31:44,  7.93s/it]  8%|███████████████                                                                                                                                                                                 | 445/5680 [1:02:03<11:30:20,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6799', 'grad_norm': '0.2111', 'learning_rate': '0.000197', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 3645440, 'tokens/trainable': 3624775, 'epoch': '0.07835'}
  8%|███████████████                                                                                                                                                                                 | 445/5680 [1:02:03<11:30:20,  7.91s/it]  8%|███████████████                                                                                                                                                                                 | 446/5680 [1:02:11<11:29:21,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7006', 'grad_norm': '0.2112', 'learning_rate': '0.000197', 'ppl': '2.015', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 3653632, 'tokens/trainable': 3632927, 'epoch': '0.07852'}
  8%|███████████████                                                                                                                                                                                 | 446/5680 [1:02:11<11:29:21,  7.90s/it]  8%|███████████████                                                                                                                                                                                 | 447/5680 [1:02:19<11:28:59,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8113', 'grad_norm': '0.2197', 'learning_rate': '0.000197', 'ppl': '2.251', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 3661824, 'tokens/trainable': 3641070, 'epoch': '0.0787'}
  8%|███████████████                                                                                                                                                                                 | 447/5680 [1:02:19<11:28:59,  7.90s/it]  8%|███████████████▏                                                                                                                                                                                | 448/5680 [1:02:27<11:28:45,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.9821', 'grad_norm': '0.2439', 'learning_rate': '0.000197', 'ppl': '2.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 3670016, 'tokens/trainable': 3649184, 'epoch': '0.07887'}
  8%|███████████████▏                                                                                                                                                                                | 448/5680 [1:02:27<11:28:45,  7.90s/it]  8%|███████████████▏                                                                                                                                                                                | 449/5680 [1:02:35<11:29:40,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7384', 'grad_norm': '0.2364', 'learning_rate': '0.0001969', 'ppl': '2.093', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 3678208, 'tokens/trainable': 3657345, 'epoch': '0.07905'}
  8%|███████████████▏                                                                                                                                                                                | 449/5680 [1:02:35<11:29:40,  7.91s/it]  8%|███████████████▏                                                                                                                                                                                | 450/5680 [1:02:43<11:29:57,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5839', 'grad_norm': '0.1968', 'learning_rate': '0.0001969', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 3686400, 'tokens/trainable': 3665460, 'epoch': '0.07923'}
  8%|███████████████▏                                                                                                                                                                                | 450/5680 [1:02:43<11:29:57,  7.92s/it]  8%|███████████████▏                                                                                                                                                                                | 451/5680 [1:02:51<11:30:23,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.738', 'grad_norm': '0.2099', 'learning_rate': '0.0001969', 'ppl': '2.092', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 3694592, 'tokens/trainable': 3673623, 'epoch': '0.0794'}
  8%|███████████████▏                                                                                                                                                                                | 451/5680 [1:02:51<11:30:23,  7.92s/it]  8%|███████████████▎                                                                                                                                                                                | 452/5680 [1:02:59<11:31:50,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.171', 'grad_norm': '0.2813', 'learning_rate': '0.0001969', 'ppl': '3.225', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 3702784, 'tokens/trainable': 3681728, 'epoch': '0.07958'}
  8%|███████████████▎                                                                                                                                                                                | 452/5680 [1:02:59<11:31:50,  7.94s/it]  8%|███████████████▎                                                                                                                                                                                | 453/5680 [1:03:06<11:32:34,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '1.648', 'grad_norm': '0.2654', 'learning_rate': '0.0001969', 'ppl': '5.197', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 3710976, 'tokens/trainable': 3689822, 'epoch': '0.07975'}
  8%|███████████████▎                                                                                                                                                                                | 453/5680 [1:03:06<11:32:34,  7.95s/it]  8%|███████████████▎                                                                                                                                                                                | 454/5680 [1:03:14<11:32:54,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.9218', 'grad_norm': '0.2543', 'learning_rate': '0.0001969', 'ppl': '2.514', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 3719168, 'tokens/trainable': 3697983, 'epoch': '0.07993'}
  8%|███████████████▎                                                                                                                                                                                | 454/5680 [1:03:14<11:32:54,  7.96s/it]  8%|███████████████▍                                                                                                                                                                                | 455/5680 [1:03:22<11:34:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.8138', 'grad_norm': '0.2176', 'learning_rate': '0.0001969', 'ppl': '2.256', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 3727360, 'tokens/trainable': 3706118, 'epoch': '0.08011'}
  8%|███████████████▍                                                                                                                                                                                | 455/5680 [1:03:22<11:34:15,  7.97s/it]  8%|███████████████▍                                                                                                                                                                                | 456/5680 [1:03:30<11:33:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6729', 'grad_norm': '0.2122', 'learning_rate': '0.0001969', 'ppl': '1.96', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 3735552, 'tokens/trainable': 3714269, 'epoch': '0.08028'}
  8%|███████████████▍                                                                                                                                                                                | 456/5680 [1:03:30<11:33:30,  7.97s/it]  8%|███████████████▍                                                                                                                                                                                | 457/5680 [1:03:38<11:34:02,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.8434', 'grad_norm': '0.2543', 'learning_rate': '0.0001968', 'ppl': '2.324', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3743744, 'tokens/trainable': 3722360, 'epoch': '0.08046'}
  8%|███████████████▍                                                                                                                                                                                | 457/5680 [1:03:38<11:34:02,  7.97s/it]  8%|███████████████▍                                                                                                                                                                                | 458/5680 [1:03:46<11:34:04,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.9532', 'grad_norm': '0.2298', 'learning_rate': '0.0001968', 'ppl': '2.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 3751936, 'tokens/trainable': 3730503, 'epoch': '0.08063'}
  8%|███████████████▍                                                                                                                                                                                | 458/5680 [1:03:46<11:34:04,  7.97s/it]  8%|███████████████▌                                                                                                                                                                                | 459/5680 [1:03:54<11:33:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6659', 'grad_norm': '0.1959', 'learning_rate': '0.0001968', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 3760128, 'tokens/trainable': 3738644, 'epoch': '0.08081'}
  8%|███████████████▌                                                                                                                                                                                | 459/5680 [1:03:54<11:33:12,  7.97s/it]  8%|███████████████▌                                                                                                                                                                                | 460/5680 [1:04:02<11:33:23,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7237', 'grad_norm': '0.2133', 'learning_rate': '0.0001968', 'ppl': '2.062', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 3768320, 'tokens/trainable': 3746778, 'epoch': '0.08099'}
  8%|███████████████▌                                                                                                                                                                                | 460/5680 [1:04:02<11:33:23,  7.97s/it]  8%|███████████████▌                                                                                                                                                                                | 461/5680 [1:04:10<11:33:04,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.9526', 'grad_norm': '0.2212', 'learning_rate': '0.0001968', 'ppl': '2.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 3776512, 'tokens/trainable': 3754949, 'epoch': '0.08116'}
  8%|███████████████▌                                                                                                                                                                                | 461/5680 [1:04:10<11:33:04,  7.97s/it]  8%|███████████████▌                                                                                                                                                                                | 462/5680 [1:04:18<11:32:23,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '1.184', 'grad_norm': '0.2759', 'learning_rate': '0.0001968', 'ppl': '3.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 3784704, 'tokens/trainable': 3763112, 'epoch': '0.08134'}
  8%|███████████████▌                                                                                                                                                                                | 462/5680 [1:04:18<11:32:23,  7.96s/it]  8%|███████████████▋                                                                                                                                                                                | 463/5680 [1:04:26<11:32:22,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6329', 'grad_norm': '0.1972', 'learning_rate': '0.0001968', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 3792896, 'tokens/trainable': 3771243, 'epoch': '0.08151'}
  8%|███████████████▋                                                                                                                                                                                | 463/5680 [1:04:26<11:32:22,  7.96s/it]  8%|███████████████▋                                                                                                                                                                                | 464/5680 [1:04:34<11:31:44,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.718', 'grad_norm': '0.1973', 'learning_rate': '0.0001967', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 3801088, 'tokens/trainable': 3779433, 'epoch': '0.08169'}
  8%|███████████████▋                                                                                                                                                                                | 464/5680 [1:04:34<11:31:44,  7.96s/it]  8%|███████████████▋                                                                                                                                                                                | 465/5680 [1:04:42<11:34:32,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.601', 'grad_norm': '0.1902', 'learning_rate': '0.0001967', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3809280, 'tokens/trainable': 3787603, 'epoch': '0.08187'}
  8%|███████████████▋                                                                                                                                                                                | 465/5680 [1:04:42<11:34:32,  7.99s/it]  8%|███████████████▊                                                                                                                                                                                | 466/5680 [1:04:50<11:36:26,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6938', 'grad_norm': '0.2101', 'learning_rate': '0.0001967', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 3817472, 'tokens/trainable': 3795736, 'epoch': '0.08204'}
  8%|███████████████▊                                                                                                                                                                                | 466/5680 [1:04:50<11:36:26,  8.01s/it]  8%|███████████████▊                                                                                                                                                                                | 467/5680 [1:04:58<11:38:03,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4554', 'grad_norm': '0.1849', 'learning_rate': '0.0001967', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3825664, 'tokens/trainable': 3803910, 'epoch': '0.08222'}
  8%|███████████████▊                                                                                                                                                                                | 467/5680 [1:04:58<11:38:03,  8.03s/it]  8%|███████████████▊                                                                                                                                                                                | 468/5680 [1:05:06<11:38:23,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.8172', 'grad_norm': '0.2066', 'learning_rate': '0.0001967', 'ppl': '2.264', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 3833856, 'tokens/trainable': 3811986, 'epoch': '0.08239'}
  8%|███████████████▊                                                                                                                                                                                | 468/5680 [1:05:06<11:38:23,  8.04s/it]  8%|███████████████▊                                                                                                                                                                                | 469/5680 [1:05:14<11:38:31,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.4618', 'grad_norm': '0.177', 'learning_rate': '0.0001967', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 3842048, 'tokens/trainable': 3820114, 'epoch': '0.08257'}
  8%|███████████████▊                                                                                                                                                                                | 469/5680 [1:05:14<11:38:31,  8.04s/it]  8%|███████████████▉                                                                                                                                                                                | 470/5680 [1:05:22<11:38:40,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.8889', 'grad_norm': '0.244', 'learning_rate': '0.0001967', 'ppl': '2.432', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 3850240, 'tokens/trainable': 3828265, 'epoch': '0.08275'}
  8%|███████████████▉                                                                                                                                                                                | 470/5680 [1:05:22<11:38:40,  8.05s/it]  8%|███████████████▉                                                                                                                                                                                | 471/5680 [1:05:31<11:38:51,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5763', 'grad_norm': '0.1996', 'learning_rate': '0.0001966', 'ppl': '1.779', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 3858432, 'tokens/trainable': 3836358, 'epoch': '0.08292'}
  8%|███████████████▉                                                                                                                                                                                | 471/5680 [1:05:31<11:38:51,  8.05s/it]  8%|███████████████▉                                                                                                                                                                                | 472/5680 [1:05:39<11:40:05,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.67', 'grad_norm': '0.2244', 'learning_rate': '0.0001966', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 3866624, 'tokens/trainable': 3844476, 'epoch': '0.0831'}
  8%|███████████████▉                                                                                                                                                                                | 472/5680 [1:05:39<11:40:05,  8.07s/it]  8%|███████████████▉                                                                                                                                                                                | 473/5680 [1:05:47<11:39:35,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9149', 'grad_norm': '0.2777', 'learning_rate': '0.0001966', 'ppl': '2.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 3874816, 'tokens/trainable': 3852561, 'epoch': '0.08327'}
  8%|███████████████▉                                                                                                                                                                                | 473/5680 [1:05:47<11:39:35,  8.06s/it]  8%|████████████████                                                                                                                                                                                | 474/5680 [1:05:55<11:40:30,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '1.155', 'grad_norm': '0.2557', 'learning_rate': '0.0001966', 'ppl': '3.173', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 3883008, 'tokens/trainable': 3860707, 'epoch': '0.08345'}
  8%|████████████████                                                                                                                                                                                | 474/5680 [1:05:55<11:40:30,  8.07s/it]  8%|████████████████                                                                                                                                                                                | 475/5680 [1:06:03<11:40:31,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.6914', 'grad_norm': '0.1999', 'learning_rate': '0.0001966', 'ppl': '1.997', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.2', 'tokens/total': 3891200, 'tokens/trainable': 3868770, 'epoch': '0.08363'}
  8%|████████████████                                                                                                                                                                                | 475/5680 [1:06:03<11:40:31,  8.08s/it]  8%|████████████████                                                                                                                                                                                | 476/5680 [1:06:11<11:39:52,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.912', 'grad_norm': '0.2204', 'learning_rate': '0.0001966', 'ppl': '2.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 3899392, 'tokens/trainable': 3876868, 'epoch': '0.0838'}
  8%|████████████████                                                                                                                                                                                | 476/5680 [1:06:11<11:39:52,  8.07s/it]  8%|████████████████                                                                                                                                                                                | 477/5680 [1:06:19<11:39:20,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.5289', 'grad_norm': '0.1918', 'learning_rate': '0.0001966', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 3907584, 'tokens/trainable': 3884942, 'epoch': '0.08398'}
  8%|████████████████                                                                                                                                                                                | 477/5680 [1:06:19<11:39:20,  8.06s/it]  8%|████████████████▏                                                                                                                                                                               | 478/5680 [1:06:27<11:38:29,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.6013', 'grad_norm': '0.185', 'learning_rate': '0.0001965', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 3915776, 'tokens/trainable': 3893115, 'epoch': '0.08415'}
  8%|████████████████▏                                                                                                                                                                               | 478/5680 [1:06:27<11:38:29,  8.06s/it]  8%|████████████████▏                                                                                                                                                                               | 479/5680 [1:06:35<11:38:20,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.8034', 'grad_norm': '0.2205', 'learning_rate': '0.0001965', 'ppl': '2.233', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3923968, 'tokens/trainable': 3901268, 'epoch': '0.08433'}
  8%|████████████████▏                                                                                                                                                                               | 479/5680 [1:06:35<11:38:20,  8.06s/it]  8%|████████████████▏                                                                                                                                                                               | 480/5680 [1:06:43<11:37:59,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.6625', 'grad_norm': '0.2092', 'learning_rate': '0.0001965', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 3932160, 'tokens/trainable': 3909395, 'epoch': '0.08451'}
  8%|████████████████▏                                                                                                                                                                               | 480/5680 [1:06:43<11:37:59,  8.05s/it]  8%|████████████████▎                                                                                                                                                                               | 481/5680 [1:06:51<11:38:42,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7242', 'grad_norm': '0.2144', 'learning_rate': '0.0001965', 'ppl': '2.063', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.5', 'tokens/total': 3940352, 'tokens/trainable': 3917427, 'epoch': '0.08468'}
  8%|████████████████▎                                                                                                                                                                               | 481/5680 [1:06:51<11:38:42,  8.06s/it]  8%|████████████████▎                                                                                                                                                                               | 482/5680 [1:06:59<11:38:37,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7681', 'grad_norm': '0.2412', 'learning_rate': '0.0001965', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 3948544, 'tokens/trainable': 3925600, 'epoch': '0.08486'}
  8%|████████████████▎                                                                                                                                                                               | 482/5680 [1:06:59<11:38:37,  8.06s/it]  9%|████████████████▎                                                                                                                                                                               | 483/5680 [1:07:07<11:38:24,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9765', 'grad_norm': '0.2371', 'learning_rate': '0.0001965', 'ppl': '2.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 3956736, 'tokens/trainable': 3933760, 'epoch': '0.08504'}
  9%|████████████████▎                                                                                                                                                                               | 483/5680 [1:07:07<11:38:24,  8.06s/it]  9%|████████████████▎                                                                                                                                                                               | 484/5680 [1:07:15<11:37:29,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.6554', 'grad_norm': '0.1909', 'learning_rate': '0.0001965', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 3964928, 'tokens/trainable': 3941875, 'epoch': '0.08521'}
  9%|████████████████▎                                                                                                                                                                               | 484/5680 [1:07:15<11:37:29,  8.05s/it]  9%|████████████████▍                                                                                                                                                                               | 485/5680 [1:07:23<11:32:46,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.8947', 'grad_norm': '0.225', 'learning_rate': '0.0001964', 'ppl': '2.447', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 3973120, 'tokens/trainable': 3950030, 'epoch': '0.08539'}
  9%|████████████████▍                                                                                                                                                                               | 485/5680 [1:07:23<11:32:46,  8.00s/it]  9%|████████████████▍                                                                                                                                                                               | 486/5680 [1:07:31<11:29:35,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7032', 'grad_norm': '0.2171', 'learning_rate': '0.0001964', 'ppl': '2.02', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 3981312, 'tokens/trainable': 3958174, 'epoch': '0.08556'}
  9%|████████████████▍                                                                                                                                                                               | 486/5680 [1:07:31<11:29:35,  7.97s/it]  9%|████████████████▍                                                                                                                                                                               | 487/5680 [1:07:39<11:27:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.016', 'grad_norm': '0.2802', 'learning_rate': '0.0001964', 'ppl': '2.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 3989504, 'tokens/trainable': 3966263, 'epoch': '0.08574'}
  9%|████████████████▍                                                                                                                                                                               | 487/5680 [1:07:39<11:27:24,  7.94s/it]  9%|████████████████▍                                                                                                                                                                               | 488/5680 [1:07:47<11:26:57,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.574', 'grad_norm': '0.1999', 'learning_rate': '0.0001964', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 3997696, 'tokens/trainable': 3974352, 'epoch': '0.08592'}
  9%|████████████████▍                                                                                                                                                                               | 488/5680 [1:07:47<11:26:57,  7.94s/it]  9%|████████████████▌                                                                                                                                                                               | 489/5680 [1:07:55<11:26:35,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6493', 'grad_norm': '0.2097', 'learning_rate': '0.0001964', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 4005888, 'tokens/trainable': 3982465, 'epoch': '0.08609'}
  9%|████████████████▌                                                                                                                                                                               | 489/5680 [1:07:55<11:26:35,  7.94s/it]  9%|████████████████▌                                                                                                                                                                               | 490/5680 [1:08:03<11:26:09,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6663', 'grad_norm': '0.2152', 'learning_rate': '0.0001964', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 4014080, 'tokens/trainable': 3990645, 'epoch': '0.08627'}
  9%|████████████████▌                                                                                                                                                                               | 490/5680 [1:08:03<11:26:09,  7.93s/it]  9%|████████████████▌                                                                                                                                                                               | 491/5680 [1:08:11<11:24:55,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8247', 'grad_norm': '0.2203', 'learning_rate': '0.0001963', 'ppl': '2.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 4022272, 'tokens/trainable': 3998794, 'epoch': '0.08644'}
  9%|████████████████▌                                                                                                                                                                               | 491/5680 [1:08:11<11:24:55,  7.92s/it]  9%|████████████████▋                                                                                                                                                                               | 492/5680 [1:08:19<11:23:53,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5146', 'grad_norm': '0.1907', 'learning_rate': '0.0001963', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 4030464, 'tokens/trainable': 4006918, 'epoch': '0.08662'}
  9%|████████████████▋                                                                                                                                                                               | 492/5680 [1:08:19<11:23:53,  7.91s/it]  9%|████████████████▋                                                                                                                                                                               | 493/5680 [1:08:26<11:22:27,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8107', 'grad_norm': '0.2524', 'learning_rate': '0.0001963', 'ppl': '2.249', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 4038656, 'tokens/trainable': 4015081, 'epoch': '0.0868'}
  9%|████████████████▋                                                                                                                                                                               | 493/5680 [1:08:26<11:22:27,  7.89s/it]  9%|████████████████▋                                                                                                                                                                               | 494/5680 [1:08:34<11:22:25,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8389', 'grad_norm': '0.2115', 'learning_rate': '0.0001963', 'ppl': '2.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 4046848, 'tokens/trainable': 4023169, 'epoch': '0.08697'}
  9%|████████████████▋                                                                                                                                                                               | 494/5680 [1:08:34<11:22:25,  7.90s/it]  9%|████████████████▋                                                                                                                                                                               | 495/5680 [1:08:42<11:22:29,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5907', 'grad_norm': '0.1976', 'learning_rate': '0.0001963', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 4055040, 'tokens/trainable': 4031317, 'epoch': '0.08715'}
  9%|████████████████▋                                                                                                                                                                               | 495/5680 [1:08:42<11:22:29,  7.90s/it]  9%|████████████████▊                                                                                                                                                                               | 496/5680 [1:08:50<11:21:43,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7879', 'grad_norm': '0.2211', 'learning_rate': '0.0001963', 'ppl': '2.199', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 4063232, 'tokens/trainable': 4039385, 'epoch': '0.08732'}
  9%|████████████████▊                                                                                                                                                                               | 496/5680 [1:08:50<11:21:43,  7.89s/it]  9%|████████████████▊                                                                                                                                                                               | 497/5680 [1:08:58<11:21:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.9873', 'grad_norm': '0.2526', 'learning_rate': '0.0001963', 'ppl': '2.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 4071424, 'tokens/trainable': 4047405, 'epoch': '0.0875'}
  9%|████████████████▊                                                                                                                                                                               | 497/5680 [1:08:58<11:21:16,  7.89s/it]  9%|████████████████▊                                                                                                                                                                               | 498/5680 [1:09:07<11:40:44,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.5684', 'grad_norm': '0.1873', 'learning_rate': '0.0001962', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '942.4', 'tokens/total': 4079616, 'tokens/trainable': 4055548, 'epoch': '0.08768'}
  9%|████████████████▊                                                                                                                                                                               | 498/5680 [1:09:07<11:40:44,  8.11s/it]  9%|████████████████▊                                                                                                                                                                               | 499/5680 [1:09:15<11:39:58,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.614', 'grad_norm': '0.2077', 'learning_rate': '0.0001962', 'ppl': '1.848', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 4087808, 'tokens/trainable': 4063653, 'epoch': '0.08785'}
  9%|████████████████▊                                                                                                                                                                               | 499/5680 [1:09:15<11:39:58,  8.11s/it]  9%|████████████████▉                                                                                                                                                                               | 500/5680 [1:09:23<11:39:44,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.6402', 'grad_norm': '0.2103', 'learning_rate': '0.0001962', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 4096000, 'tokens/trainable': 4071769, 'epoch': '0.08803'}
  9%|████████████████▉                                                                                                                                                                               | 500/5680 [1:09:23<11:39:44,  8.11s/it]  9%|████████████████▉                                                                                                                                                                               | 501/5680 [1:09:31<11:40:40,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.645', 'grad_norm': '0.2203', 'learning_rate': '0.0001962', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 4104192, 'tokens/trainable': 4079951, 'epoch': '0.0882'}
  9%|████████████████▉                                                                                                                                                                               | 501/5680 [1:09:31<11:40:40,  8.12s/it]  9%|████████████████▉                                                                                                                                                                               | 502/5680 [1:09:39<11:40:36,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.7074', 'grad_norm': '0.2149', 'learning_rate': '0.0001962', 'ppl': '2.029', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.1', 'tokens/total': 4112384, 'tokens/trainable': 4088021, 'epoch': '0.08838'}
  9%|████████████████▉                                                                                                                                                                               | 502/5680 [1:09:39<11:40:36,  8.12s/it]  9%|█████████████████                                                                                                                                                                               | 503/5680 [1:09:47<11:39:09,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.7986', 'grad_norm': '0.2397', 'learning_rate': '0.0001962', 'ppl': '2.222', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 4120576, 'tokens/trainable': 4096108, 'epoch': '0.08856'}
  9%|█████████████████                                                                                                                                                                               | 503/5680 [1:09:47<11:39:09,  8.10s/it]  9%|█████████████████                                                                                                                                                                               | 504/5680 [1:09:55<11:38:59,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.688', 'grad_norm': '0.2065', 'learning_rate': '0.0001962', 'ppl': '1.99', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 4128768, 'tokens/trainable': 4104265, 'epoch': '0.08873'}
  9%|█████████████████                                                                                                                                                                               | 504/5680 [1:09:55<11:38:59,  8.10s/it]  9%|█████████████████                                                                                                                                                                               | 505/5680 [1:10:03<11:40:12,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.7917', 'grad_norm': '0.2397', 'learning_rate': '0.0001961', 'ppl': '2.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.5', 'tokens/total': 4136960, 'tokens/trainable': 4112413, 'epoch': '0.08891'}
  9%|█████████████████                                                                                                                                                                               | 505/5680 [1:10:03<11:40:12,  8.12s/it]  9%|█████████████████                                                                                                                                                                               | 506/5680 [1:10:12<11:45:11,  8.18s/it]                                                                                                                                                                                                                                             {'loss': '0.9498', 'grad_norm': '0.2411', 'learning_rate': '0.0001961', 'ppl': '2.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.3', 'tokens/total': 4145152, 'tokens/trainable': 4120580, 'epoch': '0.08908'}
  9%|█████████████████                                                                                                                                                                               | 506/5680 [1:10:12<11:45:11,  8.18s/it]  9%|█████████████████▏                                                                                                                                                                              | 507/5680 [1:10:20<11:42:04,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.4621', 'grad_norm': '0.1757', 'learning_rate': '0.0001961', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 4153344, 'tokens/trainable': 4128689, 'epoch': '0.08926'}
  9%|█████████████████▏                                                                                                                                                                              | 507/5680 [1:10:20<11:42:04,  8.14s/it]  9%|█████████████████▏                                                                                                                                                                              | 508/5680 [1:10:28<11:48:39,  8.22s/it]                                                                                                                                                                                                                                             {'loss': '0.6152', 'grad_norm': '0.2081', 'learning_rate': '0.0001961', 'ppl': '1.85', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '960.6', 'tokens/total': 4161536, 'tokens/trainable': 4136759, 'epoch': '0.08944'}
  9%|█████████████████▏                                                                                                                                                                              | 508/5680 [1:10:28<11:48:39,  8.22s/it]  9%|█████████████████▏                                                                                                                                                                              | 509/5680 [1:10:36<11:44:04,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.6338', 'grad_norm': '0.1989', 'learning_rate': '0.0001961', 'ppl': '1.885', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 4169728, 'tokens/trainable': 4144825, 'epoch': '0.08961'}
  9%|█████████████████▏                                                                                                                                                                              | 509/5680 [1:10:36<11:44:04,  8.17s/it]  9%|█████████████████▏                                                                                                                                                                              | 510/5680 [1:10:44<11:40:25,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.3839', 'grad_norm': '0.1706', 'learning_rate': '0.0001961', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 4177920, 'tokens/trainable': 4152936, 'epoch': '0.08979'}
  9%|█████████████████▏                                                                                                                                                                              | 510/5680 [1:10:44<11:40:25,  8.13s/it]  9%|█████████████████▎                                                                                                                                                                              | 511/5680 [1:10:52<11:39:04,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.6344', 'grad_norm': '0.2319', 'learning_rate': '0.000196', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976.8', 'tokens/total': 4186112, 'tokens/trainable': 4160828, 'epoch': '0.08996'}
  9%|█████████████████▎                                                                                                                                                                              | 511/5680 [1:10:52<11:39:04,  8.11s/it]  9%|█████████████████▎                                                                                                                                                                              | 512/5680 [1:11:00<11:36:15,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.7334', 'grad_norm': '0.2332', 'learning_rate': '0.000196', 'ppl': '2.082', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 4194304, 'tokens/trainable': 4168977, 'epoch': '0.09014'}
  9%|█████████████████▎                                                                                                                                                                              | 512/5680 [1:11:00<11:36:15,  8.08s/it]  9%|█████████████████▎                                                                                                                                                                              | 513/5680 [1:11:08<11:34:56,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.8624', 'grad_norm': '0.2156', 'learning_rate': '0.000196', 'ppl': '2.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 4202496, 'tokens/trainable': 4177145, 'epoch': '0.09032'}
  9%|█████████████████▎                                                                                                                                                                              | 513/5680 [1:11:08<11:34:56,  8.07s/it]  9%|█████████████████▎                                                                                                                                                                              | 514/5680 [1:11:17<11:43:02,  8.17s/it]                                                                                                                                                                                                                                             {'loss': '0.8749', 'grad_norm': '0.2599', 'learning_rate': '0.000196', 'ppl': '2.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '952.7', 'tokens/total': 4210688, 'tokens/trainable': 4185135, 'epoch': '0.09049'}
  9%|█████████████████▎                                                                                                                                                                              | 514/5680 [1:11:17<11:43:02,  8.17s/it]  9%|█████████████████▍                                                                                                                                                                              | 515/5680 [1:11:25<11:39:11,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.5015', 'grad_norm': '0.2031', 'learning_rate': '0.000196', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 4218880, 'tokens/trainable': 4193254, 'epoch': '0.09067'}
  9%|█████████████████▍                                                                                                                                                                              | 515/5680 [1:11:25<11:39:11,  8.12s/it]  9%|█████████████████▍                                                                                                                                                                              | 516/5680 [1:11:33<11:37:13,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.775', 'grad_norm': '0.2201', 'learning_rate': '0.000196', 'ppl': '2.171', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.9', 'tokens/total': 4227072, 'tokens/trainable': 4201262, 'epoch': '0.09085'}
  9%|█████████████████▍                                                                                                                                                                              | 516/5680 [1:11:33<11:37:13,  8.10s/it]  9%|█████████████████▍                                                                                                                                                                              | 517/5680 [1:11:41<11:34:49,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6266', 'grad_norm': '0.1985', 'learning_rate': '0.000196', 'ppl': '1.871', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 4235264, 'tokens/trainable': 4209429, 'epoch': '0.09102'}
  9%|█████████████████▍                                                                                                                                                                              | 517/5680 [1:11:41<11:34:49,  8.07s/it]  9%|█████████████████▌                                                                                                                                                                              | 518/5680 [1:11:49<11:32:18,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5653', 'grad_norm': '0.2447', 'learning_rate': '0.0001959', 'ppl': '1.76', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 4243456, 'tokens/trainable': 4217459, 'epoch': '0.0912'}
  9%|█████████████████▌                                                                                                                                                                              | 518/5680 [1:11:49<11:32:18,  8.05s/it]  9%|█████████████████▌                                                                                                                                                                              | 519/5680 [1:11:57<11:34:44,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '1.159', 'grad_norm': '0.2413', 'learning_rate': '0.0001959', 'ppl': '3.185', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.4', 'tokens/total': 4251648, 'tokens/trainable': 4225427, 'epoch': '0.09137'}
  9%|█████████████████▌                                                                                                                                                                              | 519/5680 [1:11:57<11:34:44,  8.08s/it]  9%|█████████████████▌                                                                                                                                                                              | 520/5680 [1:12:06<11:45:13,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.8105', 'grad_norm': '0.2101', 'learning_rate': '0.0001959', 'ppl': '2.249', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '958.3', 'tokens/total': 4259840, 'tokens/trainable': 4233559, 'epoch': '0.09155'}
  9%|█████████████████▌                                                                                                                                                                              | 520/5680 [1:12:06<11:45:13,  8.20s/it]  9%|█████████████████▌                                                                                                                                                                              | 521/5680 [1:12:33<20:05:56, 14.03s/it]                                                                                                                                                                                                                                             {'loss': '0.8827', 'grad_norm': '0.2254', 'learning_rate': '0.0001959', 'ppl': '2.418', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '286', 'tokens/total': 4268032, 'tokens/trainable': 4241458, 'epoch': '0.09173'}
  9%|█████████████████▌                                                                                                                                                                              | 521/5680 [1:12:33<20:05:56, 14.03s/it]  9%|█████████████████▋                                                                                                                                                                              | 522/5680 [1:12:49<20:45:51, 14.49s/it]                                                                                                                                                                                                                                             {'loss': '0.6964', 'grad_norm': '0.206', 'learning_rate': '0.0001959', 'ppl': '2.007', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '522.9', 'tokens/total': 4276224, 'tokens/trainable': 4249598, 'epoch': '0.0919'}
  9%|█████████████████▋                                                                                                                                                                              | 522/5680 [1:12:49<20:45:51, 14.49s/it]  9%|█████████████████▋                                                                                                                                                                              | 523/5680 [1:12:57<18:02:04, 12.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6489', 'grad_norm': '0.2016', 'learning_rate': '0.0001959', 'ppl': '1.913', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 4284416, 'tokens/trainable': 4257748, 'epoch': '0.09208'}
  9%|█████████████████▋                                                                                                                                                                              | 523/5680 [1:12:57<18:02:04, 12.59s/it]  9%|█████████████████▋                                                                                                                                                                              | 524/5680 [1:13:05<16:07:33, 11.26s/it]                                                                                                                                                                                                                                             {'loss': '0.7609', 'grad_norm': '0.2138', 'learning_rate': '0.0001958', 'ppl': '2.14', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990', 'tokens/total': 4292608, 'tokens/trainable': 4265820, 'epoch': '0.09225'}
  9%|█████████████████▋                                                                                                                                                                              | 524/5680 [1:13:05<16:07:33, 11.26s/it]  9%|█████████████████▋                                                                                                                                                                              | 525/5680 [1:13:13<14:49:35, 10.35s/it]                                                                                                                                                                                                                                             {'loss': '1.002', 'grad_norm': '0.2597', 'learning_rate': '0.0001958', 'ppl': '2.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972', 'tokens/total': 4300800, 'tokens/trainable': 4273827, 'epoch': '0.09243'}
  9%|█████████████████▋                                                                                                                                                                              | 525/5680 [1:13:13<14:49:35, 10.35s/it]  9%|█████████████████▊                                                                                                                                                                              | 526/5680 [1:13:22<13:55:32,  9.73s/it]                                                                                                                                                                                                                                             {'loss': '0.8312', 'grad_norm': '0.2482', 'learning_rate': '0.0001958', 'ppl': '2.296', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '969.8', 'tokens/total': 4308992, 'tokens/trainable': 4281839, 'epoch': '0.09261'}
  9%|█████████████████▊                                                                                                                                                                              | 526/5680 [1:13:22<13:55:32,  9.73s/it]  9%|█████████████████▊                                                                                                                                                                              | 527/5680 [1:13:30<13:17:58,  9.29s/it]                                                                                                                                                                                                                                             {'loss': '0.5971', 'grad_norm': '0.2069', 'learning_rate': '0.0001958', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982', 'tokens/total': 4317184, 'tokens/trainable': 4289963, 'epoch': '0.09278'}
  9%|█████████████████▊                                                                                                                                                                              | 527/5680 [1:13:30<13:17:58,  9.29s/it]  9%|█████████████████▊                                                                                                                                                                              | 528/5680 [1:13:45<15:38:53, 10.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7937', 'grad_norm': '0.2556', 'learning_rate': '0.0001958', 'ppl': '2.212', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '551.7', 'tokens/total': 4325376, 'tokens/trainable': 4298109, 'epoch': '0.09296'}
  9%|█████████████████▊                                                                                                                                                                              | 528/5680 [1:13:45<15:38:53, 10.93s/it]  9%|█████████████████▉                                                                                                                                                                              | 529/5680 [1:14:00<17:36:09, 12.30s/it]                                                                                                                                                                                                                                             {'loss': '0.6879', 'grad_norm': '0.2186', 'learning_rate': '0.0001958', 'ppl': '1.99', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '529.5', 'tokens/total': 4333568, 'tokens/trainable': 4306273, 'epoch': '0.09313'}
  9%|█████████████████▉                                                                                                                                                                              | 529/5680 [1:14:00<17:36:09, 12.30s/it]  9%|█████████████████▉                                                                                                                                                                              | 530/5680 [1:14:14<18:16:31, 12.77s/it]                                                                                                                                                                                                                                             {'loss': '0.5282', 'grad_norm': '0.2026', 'learning_rate': '0.0001958', 'ppl': '1.696', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '583.6', 'tokens/total': 4341760, 'tokens/trainable': 4314370, 'epoch': '0.09331'}
  9%|█████████████████▉                                                                                                                                                                              | 530/5680 [1:14:14<18:16:31, 12.77s/it]  9%|█████████████████▉                                                                                                                                                                              | 531/5680 [1:14:28<18:43:09, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.8583', 'grad_norm': '0.2279', 'learning_rate': '0.0001957', 'ppl': '2.359', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '558.3', 'tokens/total': 4349952, 'tokens/trainable': 4322080, 'epoch': '0.09349'}
  9%|█████████████████▉                                                                                                                                                                              | 531/5680 [1:14:28<18:43:09, 13.09s/it]  9%|█████████████████▉                                                                                                                                                                              | 532/5680 [1:14:42<19:02:04, 13.31s/it]                                                                                                                                                                                                                                             {'loss': '0.614', 'grad_norm': '0.2129', 'learning_rate': '0.0001957', 'ppl': '1.848', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '569.6', 'tokens/total': 4358144, 'tokens/trainable': 4329951, 'epoch': '0.09366'}
  9%|█████████████████▉                                                                                                                                                                              | 532/5680 [1:14:42<19:02:04, 13.31s/it]  9%|██████████████████                                                                                                                                                                              | 533/5680 [1:14:55<19:16:44, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.973', 'grad_norm': '0.2538', 'learning_rate': '0.0001957', 'ppl': '2.646', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '582.3', 'tokens/total': 4366336, 'tokens/trainable': 4338033, 'epoch': '0.09384'}
  9%|██████████████████                                                                                                                                                                              | 533/5680 [1:14:55<19:16:44, 13.48s/it]  9%|██████████████████                                                                                                                                                                              | 534/5680 [1:15:09<19:23:59, 13.57s/it]                                                                                                                                                                                                                                             {'loss': '0.7876', 'grad_norm': '0.2196', 'learning_rate': '0.0001957', 'ppl': '2.198', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '583.4', 'tokens/total': 4374528, 'tokens/trainable': 4346065, 'epoch': '0.09401'}
  9%|██████████████████                                                                                                                                                                              | 534/5680 [1:15:09<19:23:59, 13.57s/it]  9%|██████████████████                                                                                                                                                                              | 535/5680 [1:15:23<19:39:58, 13.76s/it]                                                                                                                                                                                                                                             {'loss': '0.7359', 'grad_norm': '0.2454', 'learning_rate': '0.0001957', 'ppl': '2.087', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '540.8', 'tokens/total': 4382720, 'tokens/trainable': 4353741, 'epoch': '0.09419'}
  9%|██████████████████                                                                                                                                                                              | 535/5680 [1:15:23<19:39:58, 13.76s/it]  9%|██████████████████                                                                                                                                                                              | 536/5680 [1:15:37<19:43:15, 13.80s/it]                                                                                                                                                                                                                                             {'loss': '0.7204', 'grad_norm': '0.2381', 'learning_rate': '0.0001957', 'ppl': '2.055', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '563.9', 'tokens/total': 4390912, 'tokens/trainable': 4361575, 'epoch': '0.09437'}
  9%|██████████████████                                                                                                                                                                              | 536/5680 [1:15:37<19:43:15, 13.80s/it]  9%|██████████████████▏                                                                                                                                                                             | 537/5680 [1:15:51<19:35:12, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.8296', 'grad_norm': '0.22', 'learning_rate': '0.0001956', 'ppl': '2.292', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.7', 'tokens/total': 4399104, 'tokens/trainable': 4369516, 'epoch': '0.09454'}
  9%|██████████████████▏                                                                                                                                                                             | 537/5680 [1:15:51<19:35:12, 13.71s/it]  9%|██████████████████▏                                                                                                                                                                             | 538/5680 [1:16:04<19:30:42, 13.66s/it]                                                                                                                                                                                                                                             {'loss': '0.7506', 'grad_norm': '0.225', 'learning_rate': '0.0001956', 'ppl': '2.118', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.7', 'tokens/total': 4407296, 'tokens/trainable': 4377702, 'epoch': '0.09472'}
  9%|██████████████████▏                                                                                                                                                                             | 538/5680 [1:16:04<19:30:42, 13.66s/it]  9%|██████████████████▏                                                                                                                                                                             | 539/5680 [1:16:18<19:30:45, 13.66s/it]                                                                                                                                                                                                                                             {'loss': '0.7302', 'grad_norm': '0.2368', 'learning_rate': '0.0001956', 'ppl': '2.076', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '579.8', 'tokens/total': 4415488, 'tokens/trainable': 4385623, 'epoch': '0.09489'}
  9%|██████████████████▏                                                                                                                                                                             | 539/5680 [1:16:18<19:30:45, 13.66s/it] 10%|██████████████████▎                                                                                                                                                                             | 540/5680 [1:16:32<19:28:48, 13.64s/it]                                                                                                                                                                                                                                             {'loss': '0.7813', 'grad_norm': '0.2198', 'learning_rate': '0.0001956', 'ppl': '2.184', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.2', 'tokens/total': 4423680, 'tokens/trainable': 4393641, 'epoch': '0.09507'}
 10%|██████████████████▎                                                                                                                                                                             | 540/5680 [1:16:32<19:28:48, 13.64s/it] 10%|██████████████████▎                                                                                                                                                                             | 541/5680 [1:16:45<19:27:12, 13.63s/it]                                                                                                                                                                                                                                             {'loss': '0.6803', 'grad_norm': '0.2239', 'learning_rate': '0.0001956', 'ppl': '1.975', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '581.5', 'tokens/total': 4431872, 'tokens/trainable': 4401536, 'epoch': '0.09525'}
 10%|██████████████████▎                                                                                                                                                                             | 541/5680 [1:16:45<19:27:12, 13.63s/it] 10%|██████████████████▎                                                                                                                                                                             | 542/5680 [1:16:59<19:20:56, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.7081', 'grad_norm': '0.2489', 'learning_rate': '0.0001956', 'ppl': '2.03', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.1', 'tokens/total': 4440064, 'tokens/trainable': 4409539, 'epoch': '0.09542'}
 10%|██████████████████▎                                                                                                                                                                             | 542/5680 [1:16:59<19:20:56, 13.56s/it] 10%|██████████████████▎                                                                                                                                                                             | 543/5680 [1:17:12<19:23:22, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.868', 'grad_norm': '0.266', 'learning_rate': '0.0001955', 'ppl': '2.382', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.1', 'tokens/total': 4448256, 'tokens/trainable': 4417662, 'epoch': '0.0956'}
 10%|██████████████████▎                                                                                                                                                                             | 543/5680 [1:17:12<19:23:22, 13.59s/it] 10%|██████████████████▍                                                                                                                                                                             | 544/5680 [1:17:27<19:39:40, 13.78s/it]                                                                                                                                                                                                                                             {'loss': '0.6175', 'grad_norm': '0.2233', 'learning_rate': '0.0001955', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '564.5', 'tokens/total': 4456448, 'tokens/trainable': 4425694, 'epoch': '0.09577'}
 10%|██████████████████▍                                                                                                                                                                             | 544/5680 [1:17:27<19:39:40, 13.78s/it] 10%|██████████████████▍                                                                                                                                                                             | 545/5680 [1:17:41<19:46:14, 13.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5208', 'grad_norm': '0.1863', 'learning_rate': '0.0001955', 'ppl': '1.683', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '562.7', 'tokens/total': 4464640, 'tokens/trainable': 4433591, 'epoch': '0.09595'}
 10%|██████████████████▍                                                                                                                                                                             | 545/5680 [1:17:41<19:46:14, 13.86s/it] 10%|██████████████████▍                                                                                                                                                                             | 546/5680 [1:17:55<20:02:58, 14.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7497', 'grad_norm': '0.2148', 'learning_rate': '0.0001955', 'ppl': '2.116', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '563', 'tokens/total': 4472832, 'tokens/trainable': 4441762, 'epoch': '0.09613'}
 10%|██████████████████▍                                                                                                                                                                             | 546/5680 [1:17:55<20:02:58, 14.06s/it] 10%|██████████████████▍                                                                                                                                                                             | 547/5680 [1:18:09<20:08:55, 14.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6472', 'grad_norm': '0.2239', 'learning_rate': '0.0001955', 'ppl': '1.91', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '559.8', 'tokens/total': 4481024, 'tokens/trainable': 4449762, 'epoch': '0.0963'}
 10%|██████████████████▍                                                                                                                                                                             | 547/5680 [1:18:09<20:08:55, 14.13s/it] 10%|██████████████████▌                                                                                                                                                                             | 548/5680 [1:18:23<20:06:21, 14.10s/it]                                                                                                                                                                                                                                             {'loss': '0.897', 'grad_norm': '0.2488', 'learning_rate': '0.0001955', 'ppl': '2.452', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577.3', 'tokens/total': 4489216, 'tokens/trainable': 4457862, 'epoch': '0.09648'}
 10%|██████████████████▌                                                                                                                                                                             | 548/5680 [1:18:23<20:06:21, 14.10s/it] 10%|██████████████████▌                                                                                                                                                                             | 549/5680 [1:18:37<20:02:00, 14.06s/it]                                                                                                                                                                                                                                             {'loss': '0.866', 'grad_norm': '0.2443', 'learning_rate': '0.0001954', 'ppl': '2.377', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '562.6', 'tokens/total': 4497408, 'tokens/trainable': 4465698, 'epoch': '0.09665'}
 10%|██████████████████▌                                                                                                                                                                             | 549/5680 [1:18:37<20:02:00, 14.06s/it] 10%|██████████████████▌                                                                                                                                                                             | 550/5680 [1:18:51<19:41:51, 13.82s/it]                                                                                                                                                                                                                                             {'loss': '1.137', 'grad_norm': '0.2748', 'learning_rate': '0.0001954', 'ppl': '3.119', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.7', 'tokens/total': 4505600, 'tokens/trainable': 4473672, 'epoch': '0.09683'}
 10%|██████████████████▌                                                                                                                                                                             | 550/5680 [1:18:51<19:41:51, 13.82s/it] 10%|██████████████████▋                                                                                                                                                                             | 551/5680 [1:19:04<19:36:44, 13.77s/it]                                                                                                                                                                                                                                             {'loss': '0.5683', 'grad_norm': '0.2005', 'learning_rate': '0.0001954', 'ppl': '1.765', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.9', 'tokens/total': 4513792, 'tokens/trainable': 4481830, 'epoch': '0.09701'}
 10%|██████████████████▋                                                                                                                                                                             | 551/5680 [1:19:04<19:36:44, 13.77s/it] 10%|██████████████████▋                                                                                                                                                                             | 552/5680 [1:19:18<19:34:05, 13.74s/it]                                                                                                                                                                                                                                             {'loss': '0.9508', 'grad_norm': '0.2529', 'learning_rate': '0.0001954', 'ppl': '2.588', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '573', 'tokens/total': 4521984, 'tokens/trainable': 4489658, 'epoch': '0.09718'}
 10%|██████████████████▋                                                                                                                                                                             | 552/5680 [1:19:18<19:34:05, 13.74s/it] 10%|██████████████████▋                                                                                                                                                                             | 553/5680 [1:19:32<19:30:59, 13.70s/it]                                                                                                                                                                                                                                             {'loss': '0.8259', 'grad_norm': '0.258', 'learning_rate': '0.0001954', 'ppl': '2.284', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '572.5', 'tokens/total': 4530176, 'tokens/trainable': 4497453, 'epoch': '0.09736'}
 10%|██████████████████▋                                                                                                                                                                             | 553/5680 [1:19:32<19:30:59, 13.70s/it] 10%|██████████████████▋                                                                                                                                                                             | 554/5680 [1:19:46<19:37:33, 13.78s/it]                                                                                                                                                                                                                                             {'loss': '0.6116', 'grad_norm': '0.2156', 'learning_rate': '0.0001954', 'ppl': '1.843', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '580.2', 'tokens/total': 4538368, 'tokens/trainable': 4505552, 'epoch': '0.09754'}
 10%|██████████████████▋                                                                                                                                                                             | 554/5680 [1:19:46<19:37:33, 13.78s/it] 10%|██████████████████▊                                                                                                                                                                             | 555/5680 [1:20:00<19:50:19, 13.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5828', 'grad_norm': '0.1953', 'learning_rate': '0.0001953', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '572.8', 'tokens/total': 4546560, 'tokens/trainable': 4513728, 'epoch': '0.09771'}
 10%|██████████████████▊                                                                                                                                                                             | 555/5680 [1:20:00<19:50:19, 13.94s/it] 10%|██████████████████▊                                                                                                                                                                             | 556/5680 [1:20:14<20:01:31, 14.07s/it]                                                                                                                                                                                                                                             {'loss': '0.7181', 'grad_norm': '0.2282', 'learning_rate': '0.0001953', 'ppl': '2.051', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '550.9', 'tokens/total': 4554752, 'tokens/trainable': 4521649, 'epoch': '0.09789'}
 10%|██████████████████▊                                                                                                                                                                             | 556/5680 [1:20:14<20:01:31, 14.07s/it] 10%|██████████████████▊                                                                                                                                                                             | 557/5680 [1:20:28<20:01:03, 14.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6216', 'grad_norm': '0.2271', 'learning_rate': '0.0001953', 'ppl': '1.862', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '539.6', 'tokens/total': 4562944, 'tokens/trainable': 4529235, 'epoch': '0.09806'}
 10%|██████████████████▊                                                                                                                                                                             | 557/5680 [1:20:28<20:01:03, 14.07s/it] 10%|██████████████████▊                                                                                                                                                                             | 558/5680 [1:20:42<19:44:47, 13.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4553', 'grad_norm': '0.1911', 'learning_rate': '0.0001953', 'ppl': '1.577', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.2', 'tokens/total': 4571136, 'tokens/trainable': 4537324, 'epoch': '0.09824'}
 10%|██████████████████▊                                                                                                                                                                             | 558/5680 [1:20:42<19:44:47, 13.88s/it] 10%|██████████████████▉                                                                                                                                                                             | 559/5680 [1:20:55<19:38:19, 13.81s/it]                                                                                                                                                                                                                                             {'loss': '0.8592', 'grad_norm': '0.2482', 'learning_rate': '0.0001953', 'ppl': '2.361', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.9', 'tokens/total': 4579328, 'tokens/trainable': 4545407, 'epoch': '0.09842'}
 10%|██████████████████▉                                                                                                                                                                             | 559/5680 [1:20:55<19:38:19, 13.81s/it] 10%|██████████████████▉                                                                                                                                                                             | 560/5680 [1:21:09<19:45:57, 13.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7349', 'grad_norm': '0.2046', 'learning_rate': '0.0001953', 'ppl': '2.085', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '560.4', 'tokens/total': 4587520, 'tokens/trainable': 4553314, 'epoch': '0.09859'}
 10%|██████████████████▉                                                                                                                                                                             | 560/5680 [1:21:10<19:45:57, 13.90s/it] 10%|██████████████████▉                                                                                                                                                                             | 561/5680 [1:21:23<19:44:40, 13.89s/it]                                                                                                                                                                                                                                             {'loss': '1.015', 'grad_norm': '0.2561', 'learning_rate': '0.0001952', 'ppl': '2.759', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577.4', 'tokens/total': 4595712, 'tokens/trainable': 4561305, 'epoch': '0.09877'}
 10%|██████████████████▉                                                                                                                                                                             | 561/5680 [1:21:23<19:44:40, 13.89s/it] 10%|██████████████████▉                                                                                                                                                                             | 562/5680 [1:21:37<19:34:32, 13.77s/it]                                                                                                                                                                                                                                             {'loss': '0.7664', 'grad_norm': '0.2146', 'learning_rate': '0.0001952', 'ppl': '2.152', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '566.4', 'tokens/total': 4603904, 'tokens/trainable': 4568945, 'epoch': '0.09894'}
 10%|██████████████████▉                                                                                                                                                                             | 562/5680 [1:21:37<19:34:32, 13.77s/it] 10%|███████████████████                                                                                                                                                                             | 563/5680 [1:21:50<19:27:34, 13.69s/it]                                                                                                                                                                                                                                             {'loss': '0.801', 'grad_norm': '0.2162', 'learning_rate': '0.0001952', 'ppl': '2.228', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '584.4', 'tokens/total': 4612096, 'tokens/trainable': 4576837, 'epoch': '0.09912'}
 10%|███████████████████                                                                                                                                                                             | 563/5680 [1:21:50<19:27:34, 13.69s/it] 10%|███████████████████                                                                                                                                                                             | 564/5680 [1:22:04<19:26:19, 13.68s/it]                                                                                                                                                                                                                                             {'loss': '0.8921', 'grad_norm': '0.2659', 'learning_rate': '0.0001952', 'ppl': '2.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '573.8', 'tokens/total': 4620288, 'tokens/trainable': 4584664, 'epoch': '0.0993'}
 10%|███████████████████                                                                                                                                                                             | 564/5680 [1:22:04<19:26:19, 13.68s/it] 10%|███████████████████                                                                                                                                                                             | 565/5680 [1:22:17<19:21:14, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.5324', 'grad_norm': '0.2285', 'learning_rate': '0.0001952', 'ppl': '1.703', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577', 'tokens/total': 4628480, 'tokens/trainable': 4592441, 'epoch': '0.09947'}
 10%|███████████████████                                                                                                                                                                             | 565/5680 [1:22:17<19:21:14, 13.62s/it] 10%|███████████████████▏                                                                                                                                                                            | 566/5680 [1:22:31<19:18:38, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.9154', 'grad_norm': '0.2344', 'learning_rate': '0.0001952', 'ppl': '2.498', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.3', 'tokens/total': 4636672, 'tokens/trainable': 4600449, 'epoch': '0.09965'}
 10%|███████████████████▏                                                                                                                                                                            | 566/5680 [1:22:31<19:18:38, 13.59s/it] 10%|███████████████████▏                                                                                                                                                                            | 567/5680 [1:22:45<19:28:44, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.5969', 'grad_norm': '0.2135', 'learning_rate': '0.0001951', 'ppl': '1.816', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.3', 'tokens/total': 4644864, 'tokens/trainable': 4608634, 'epoch': '0.09982'}
 10%|███████████████████▏                                                                                                                                                                            | 567/5680 [1:22:45<19:28:44, 13.71s/it] 10%|███████████████████▏                                                                                                                                                                            | 568/5680 [1:22:59<19:31:00, 13.74s/it]                                                                                                                                                                                                                                             {'loss': '0.7801', 'grad_norm': '0.2163', 'learning_rate': '0.0001951', 'ppl': '2.182', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '573.2', 'tokens/total': 4653056, 'tokens/trainable': 4616549, 'epoch': '0.1'}
 10%|███████████████████▏                                                                                                                                                                            | 568/5680 [1:22:59<19:31:00, 13.74s/it] 10%|███████████████████▏                                                                                                                                                                            | 569/5680 [1:23:13<19:37:17, 13.82s/it]                                                                                                                                                                                                                                             {'loss': '0.7955', 'grad_norm': '0.2315', 'learning_rate': '0.0001951', 'ppl': '2.215', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '570.3', 'tokens/total': 4661248, 'tokens/trainable': 4624528, 'epoch': '0.1002'}
 10%|███████████████████▏                                                                                                                                                                            | 569/5680 [1:23:13<19:37:17, 13.82s/it] 10%|███████████████████▎                                                                                                                                                                            | 570/5680 [1:23:27<19:39:45, 13.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6823', 'grad_norm': '0.228', 'learning_rate': '0.0001951', 'ppl': '1.978', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '574.7', 'tokens/total': 4669440, 'tokens/trainable': 4632530, 'epoch': '0.1004'}
 10%|███████████████████▎                                                                                                                                                                            | 570/5680 [1:23:27<19:39:45, 13.85s/it] 10%|███████████████████▎                                                                                                                                                                            | 571/5680 [1:23:41<19:46:06, 13.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6372', 'grad_norm': '0.2045', 'learning_rate': '0.0001951', 'ppl': '1.891', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '556.7', 'tokens/total': 4677632, 'tokens/trainable': 4640381, 'epoch': '0.1005'}
 10%|███████████████████▎                                                                                                                                                                            | 571/5680 [1:23:41<19:46:06, 13.93s/it] 10%|███████████████████▎                                                                                                                                                                            | 572/5680 [1:23:55<19:48:10, 13.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7509', 'grad_norm': '0.2158', 'learning_rate': '0.0001951', 'ppl': '2.119', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '578.6', 'tokens/total': 4685824, 'tokens/trainable': 4648488, 'epoch': '0.1007'}
 10%|███████████████████▎                                                                                                                                                                            | 572/5680 [1:23:55<19:48:10, 13.96s/it] 10%|███████████████████▎                                                                                                                                                                            | 573/5680 [1:24:08<19:37:36, 13.84s/it]                                                                                                                                                                                                                                             {'loss': '0.809', 'grad_norm': '0.2344', 'learning_rate': '0.000195', 'ppl': '2.246', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.3', 'tokens/total': 4694016, 'tokens/trainable': 4656646, 'epoch': '0.1009'}
 10%|███████████████████▎                                                                                                                                                                            | 573/5680 [1:24:08<19:37:36, 13.84s/it] 10%|███████████████████▍                                                                                                                                                                            | 574/5680 [1:24:22<19:24:12, 13.68s/it]                                                                                                                                                                                                                                             {'loss': '0.7633', 'grad_norm': '0.2318', 'learning_rate': '0.000195', 'ppl': '2.145', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '609.6', 'tokens/total': 4702208, 'tokens/trainable': 4664758, 'epoch': '0.1011'}
 10%|███████████████████▍                                                                                                                                                                            | 574/5680 [1:24:22<19:24:12, 13.68s/it] 10%|███████████████████▍                                                                                                                                                                            | 575/5680 [1:24:35<19:09:07, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.8961', 'grad_norm': '0.2457', 'learning_rate': '0.000195', 'ppl': '2.45', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '609.1', 'tokens/total': 4710400, 'tokens/trainable': 4672734, 'epoch': '0.1012'}
 10%|███████████████████▍                                                                                                                                                                            | 575/5680 [1:24:35<19:09:07, 13.51s/it] 10%|███████████████████▍                                                                                                                                                                            | 576/5680 [1:24:48<19:00:54, 13.41s/it]                                                                                                                                                                                                                                             {'loss': '0.8694', 'grad_norm': '0.2594', 'learning_rate': '0.000195', 'ppl': '2.385', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '580.2', 'tokens/total': 4718592, 'tokens/trainable': 4680381, 'epoch': '0.1014'}
 10%|███████████████████▍                                                                                                                                                                            | 576/5680 [1:24:48<19:00:54, 13.41s/it] 10%|███████████████████▌                                                                                                                                                                            | 577/5680 [1:25:01<19:00:41, 13.41s/it]                                                                                                                                                                                                                                             {'loss': '0.8832', 'grad_norm': '0.2568', 'learning_rate': '0.000195', 'ppl': '2.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '605.7', 'tokens/total': 4726784, 'tokens/trainable': 4688496, 'epoch': '0.1016'}
 10%|███████████████████▌                                                                                                                                                                            | 577/5680 [1:25:01<19:00:41, 13.41s/it] 10%|███████████████████▌                                                                                                                                                                            | 578/5680 [1:25:15<19:00:49, 13.42s/it]                                                                                                                                                                                                                                             {'loss': '0.9096', 'grad_norm': '0.2192', 'learning_rate': '0.000195', 'ppl': '2.483', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.8', 'tokens/total': 4734976, 'tokens/trainable': 4696584, 'epoch': '0.1018'}
 10%|███████████████████▌                                                                                                                                                                            | 578/5680 [1:25:15<19:00:49, 13.42s/it] 10%|███████████████████▌                                                                                                                                                                            | 579/5680 [1:25:28<19:05:20, 13.47s/it]                                                                                                                                                                                                                                             {'loss': '0.5908', 'grad_norm': '0.193', 'learning_rate': '0.0001949', 'ppl': '1.806', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.6', 'tokens/total': 4743168, 'tokens/trainable': 4704566, 'epoch': '0.1019'}
 10%|███████████████████▌                                                                                                                                                                            | 579/5680 [1:25:28<19:05:20, 13.47s/it] 10%|███████████████████▌                                                                                                                                                                            | 580/5680 [1:25:42<19:09:50, 13.53s/it]                                                                                                                                                                                                                                             {'loss': '0.8135', 'grad_norm': '0.2351', 'learning_rate': '0.0001949', 'ppl': '2.256', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '583', 'tokens/total': 4751360, 'tokens/trainable': 4712522, 'epoch': '0.1021'}
 10%|███████████████████▌                                                                                                                                                                            | 580/5680 [1:25:42<19:09:50, 13.53s/it] 10%|███████████████████▋                                                                                                                                                                            | 581/5680 [1:25:56<19:09:44, 13.53s/it]                                                                                                                                                                                                                                             {'loss': '0.8537', 'grad_norm': '0.2202', 'learning_rate': '0.0001949', 'ppl': '2.348', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '573.1', 'tokens/total': 4759552, 'tokens/trainable': 4720266, 'epoch': '0.1023'}
 10%|███████████████████▋                                                                                                                                                                            | 581/5680 [1:25:56<19:09:44, 13.53s/it] 10%|███████████████████▋                                                                                                                                                                            | 582/5680 [1:26:10<19:21:59, 13.68s/it]                                                                                                                                                                                                                                             {'loss': '0.6298', 'grad_norm': '0.2055', 'learning_rate': '0.0001949', 'ppl': '1.877', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '581.5', 'tokens/total': 4767744, 'tokens/trainable': 4728413, 'epoch': '0.1025'}
 10%|███████████████████▋                                                                                                                                                                            | 582/5680 [1:26:10<19:21:59, 13.68s/it] 10%|███████████████████▋                                                                                                                                                                            | 583/5680 [1:26:24<19:26:31, 13.73s/it]                                                                                                                                                                                                                                             {'loss': '0.7339', 'grad_norm': '0.1983', 'learning_rate': '0.0001949', 'ppl': '2.083', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.2', 'tokens/total': 4775936, 'tokens/trainable': 4736565, 'epoch': '0.1026'}
 10%|███████████████████▋                                                                                                                                                                            | 583/5680 [1:26:24<19:26:31, 13.73s/it] 10%|███████████████████▋                                                                                                                                                                            | 584/5680 [1:26:38<19:41:27, 13.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6151', 'grad_norm': '0.2477', 'learning_rate': '0.0001948', 'ppl': '1.85', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '565.4', 'tokens/total': 4784128, 'tokens/trainable': 4744659, 'epoch': '0.1028'}
 10%|███████████████████▋                                                                                                                                                                            | 584/5680 [1:26:38<19:41:27, 13.91s/it] 10%|███████████████████▊                                                                                                                                                                            | 585/5680 [1:26:52<19:37:39, 13.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7391', 'grad_norm': '0.2224', 'learning_rate': '0.0001948', 'ppl': '2.094', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.9', 'tokens/total': 4792320, 'tokens/trainable': 4752778, 'epoch': '0.103'}
 10%|███████████████████▊                                                                                                                                                                            | 585/5680 [1:26:52<19:37:39, 13.87s/it] 10%|███████████████████▊                                                                                                                                                                            | 586/5680 [1:27:06<19:47:13, 13.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4558', 'grad_norm': '0.1794', 'learning_rate': '0.0001948', 'ppl': '1.577', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '534.3', 'tokens/total': 4800512, 'tokens/trainable': 4760389, 'epoch': '0.1032'}
 10%|███████████████████▊                                                                                                                                                                            | 586/5680 [1:27:06<19:47:13, 13.98s/it] 10%|███████████████████▊                                                                                                                                                                            | 587/5680 [1:27:20<19:39:26, 13.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4438', 'grad_norm': '0.1802', 'learning_rate': '0.0001948', 'ppl': '1.559', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '571.5', 'tokens/total': 4808704, 'tokens/trainable': 4768207, 'epoch': '0.1033'}
 10%|███████████████████▊                                                                                                                                                                            | 587/5680 [1:27:20<19:39:26, 13.89s/it] 10%|███████████████████▉                                                                                                                                                                            | 588/5680 [1:27:33<19:23:50, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.8028', 'grad_norm': '0.2339', 'learning_rate': '0.0001948', 'ppl': '2.232', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.2', 'tokens/total': 4816896, 'tokens/trainable': 4776006, 'epoch': '0.1035'}
 10%|███████████████████▉                                                                                                                                                                            | 588/5680 [1:27:33<19:23:50, 13.71s/it] 10%|███████████████████▉                                                                                                                                                                            | 589/5680 [1:27:46<19:15:20, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.6129', 'grad_norm': '0.201', 'learning_rate': '0.0001948', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.6', 'tokens/total': 4825088, 'tokens/trainable': 4784028, 'epoch': '0.1037'}
 10%|███████████████████▉                                                                                                                                                                            | 589/5680 [1:27:46<19:15:20, 13.62s/it] 10%|███████████████████▉                                                                                                                                                                            | 590/5680 [1:28:00<19:10:40, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.7724', 'grad_norm': '0.2105', 'learning_rate': '0.0001947', 'ppl': '2.165', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595', 'tokens/total': 4833280, 'tokens/trainable': 4792016, 'epoch': '0.1039'}
 10%|███████████████████▉                                                                                                                                                                            | 590/5680 [1:28:00<19:10:40, 13.56s/it] 10%|███████████████████▉                                                                                                                                                                            | 591/5680 [1:28:13<18:56:38, 13.40s/it]                                                                                                                                                                                                                                             {'loss': '0.9486', 'grad_norm': '0.2296', 'learning_rate': '0.0001947', 'ppl': '2.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '624.5', 'tokens/total': 4841472, 'tokens/trainable': 4800141, 'epoch': '0.104'}
 10%|███████████████████▉                                                                                                                                                                            | 591/5680 [1:28:13<18:56:38, 13.40s/it] 10%|████████████████████                                                                                                                                                                            | 592/5680 [1:28:27<19:18:40, 13.66s/it]                                                                                                                                                                                                                                             {'loss': '0.806', 'grad_norm': '0.2185', 'learning_rate': '0.0001947', 'ppl': '2.239', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '557.7', 'tokens/total': 4849664, 'tokens/trainable': 4808100, 'epoch': '0.1042'}
 10%|████████████████████                                                                                                                                                                            | 592/5680 [1:28:27<19:18:40, 13.66s/it] 10%|████████████████████                                                                                                                                                                            | 593/5680 [1:28:41<19:33:16, 13.84s/it]                                                                                                                                                                                                                                             {'loss': '0.8978', 'grad_norm': '0.2455', 'learning_rate': '0.0001947', 'ppl': '2.454', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '549.4', 'tokens/total': 4857856, 'tokens/trainable': 4815924, 'epoch': '0.1044'}
 10%|████████████████████                                                                                                                                                                            | 593/5680 [1:28:41<19:33:16, 13.84s/it] 10%|████████████████████                                                                                                                                                                            | 594/5680 [1:28:56<19:44:50, 13.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4471', 'grad_norm': '0.2006', 'learning_rate': '0.0001947', 'ppl': '1.564', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '569.4', 'tokens/total': 4866048, 'tokens/trainable': 4824064, 'epoch': '0.1046'}
 10%|████████████████████                                                                                                                                                                            | 594/5680 [1:28:56<19:44:50, 13.98s/it] 10%|████████████████████                                                                                                                                                                            | 595/5680 [1:29:10<19:44:54, 13.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7304', 'grad_norm': '0.217', 'learning_rate': '0.0001947', 'ppl': '2.076', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '570.6', 'tokens/total': 4874240, 'tokens/trainable': 4832045, 'epoch': '0.1048'}
 10%|████████████████████                                                                                                                                                                            | 595/5680 [1:29:10<19:44:54, 13.98s/it] 10%|████████████████████▏                                                                                                                                                                           | 596/5680 [1:29:24<19:44:51, 13.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6035', 'grad_norm': '0.2044', 'learning_rate': '0.0001946', 'ppl': '1.829', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '572.1', 'tokens/total': 4882432, 'tokens/trainable': 4840043, 'epoch': '0.1049'}
 10%|████████████████████▏                                                                                                                                                                           | 596/5680 [1:29:24<19:44:51, 13.98s/it] 11%|████████████████████▏                                                                                                                                                                           | 597/5680 [1:29:37<19:30:26, 13.82s/it]                                                                                                                                                                                                                                             {'loss': '0.8964', 'grad_norm': '0.2303', 'learning_rate': '0.0001946', 'ppl': '2.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '607.1', 'tokens/total': 4890624, 'tokens/trainable': 4848186, 'epoch': '0.1051'}
 11%|████████████████████▏                                                                                                                                                                           | 597/5680 [1:29:37<19:30:26, 13.82s/it] 11%|████████████████████▏                                                                                                                                                                           | 598/5680 [1:29:51<19:26:01, 13.77s/it]                                                                                                                                                                                                                                             {'loss': '1.161', 'grad_norm': '0.2391', 'learning_rate': '0.0001946', 'ppl': '3.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '579.3', 'tokens/total': 4898816, 'tokens/trainable': 4856089, 'epoch': '0.1053'}
 11%|████████████████████▏                                                                                                                                                                           | 598/5680 [1:29:51<19:26:01, 13.77s/it] 11%|████████████████████▏                                                                                                                                                                           | 599/5680 [1:30:04<19:15:24, 13.64s/it]                                                                                                                                                                                                                                             {'loss': '0.7513', 'grad_norm': '0.2237', 'learning_rate': '0.0001946', 'ppl': '2.12', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '610.9', 'tokens/total': 4907008, 'tokens/trainable': 4864244, 'epoch': '0.1055'}
 11%|████████████████████▏                                                                                                                                                                           | 599/5680 [1:30:04<19:15:24, 13.64s/it] 11%|████████████████████▎                                                                                                                                                                           | 600/5680 [1:30:17<19:10:20, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.5719', 'grad_norm': '0.2005', 'learning_rate': '0.0001946', 'ppl': '1.772', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.9', 'tokens/total': 4915200, 'tokens/trainable': 4872321, 'epoch': '0.1056'}
 11%|████████████████████▎                                                                                                                                                                           | 600/5680 [1:30:17<19:10:20, 13.59s/it] 11%|████████████████████▎                                                                                                                                                                           | 601/5680 [1:30:31<19:06:39, 13.55s/it]                                                                                                                                                                                                                                             {'loss': '0.609', 'grad_norm': '0.2055', 'learning_rate': '0.0001945', 'ppl': '1.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577', 'tokens/total': 4923392, 'tokens/trainable': 4880077, 'epoch': '0.1058'}
 11%|████████████████████▎                                                                                                                                                                           | 601/5680 [1:30:31<19:06:39, 13.55s/it] 11%|████████████████████▎                                                                                                                                                                           | 602/5680 [1:30:44<19:06:09, 13.54s/it]                                                                                                                                                                                                                                             {'loss': '0.7856', 'grad_norm': '0.2245', 'learning_rate': '0.0001945', 'ppl': '2.194', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.1', 'tokens/total': 4931584, 'tokens/trainable': 4888113, 'epoch': '0.106'}
 11%|████████████████████▎                                                                                                                                                                           | 602/5680 [1:30:44<19:06:09, 13.54s/it] 11%|████████████████████▍                                                                                                                                                                           | 603/5680 [1:30:58<19:13:05, 13.63s/it]                                                                                                                                                                                                                                             {'loss': '0.7551', 'grad_norm': '0.2256', 'learning_rate': '0.0001945', 'ppl': '2.128', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577.7', 'tokens/total': 4939776, 'tokens/trainable': 4896093, 'epoch': '0.1062'}
 11%|████████████████████▍                                                                                                                                                                           | 603/5680 [1:30:58<19:13:05, 13.63s/it] 11%|████████████████████▍                                                                                                                                                                           | 604/5680 [1:31:12<19:20:56, 13.72s/it]                                                                                                                                                                                                                                             {'loss': '0.6693', 'grad_norm': '0.1963', 'learning_rate': '0.0001945', 'ppl': '1.953', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '583.2', 'tokens/total': 4947968, 'tokens/trainable': 4904223, 'epoch': '0.1063'}
 11%|████████████████████▍                                                                                                                                                                           | 604/5680 [1:31:12<19:20:56, 13.72s/it] 11%|████████████████████▍                                                                                                                                                                           | 605/5680 [1:31:26<19:24:45, 13.77s/it]                                                                                                                                                                                                                                             {'loss': '0.6146', 'grad_norm': '0.2198', 'learning_rate': '0.0001945', 'ppl': '1.849', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '572.4', 'tokens/total': 4956160, 'tokens/trainable': 4912167, 'epoch': '0.1065'}
 11%|████████████████████▍                                                                                                                                                                           | 605/5680 [1:31:26<19:24:45, 13.77s/it] 11%|████████████████████▍                                                                                                                                                                           | 606/5680 [1:31:40<19:25:22, 13.78s/it]                                                                                                                                                                                                                                             {'loss': '0.6683', 'grad_norm': '0.2241', 'learning_rate': '0.0001945', 'ppl': '1.951', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '577.7', 'tokens/total': 4964352, 'tokens/trainable': 4920140, 'epoch': '0.1067'}
 11%|████████████████████▍                                                                                                                                                                           | 606/5680 [1:31:40<19:25:22, 13.78s/it] 11%|████████████████████▌                                                                                                                                                                           | 607/5680 [1:31:54<19:22:28, 13.75s/it]                                                                                                                                                                                                                                             {'loss': '0.8385', 'grad_norm': '0.2377', 'learning_rate': '0.0001944', 'ppl': '2.313', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.4', 'tokens/total': 4972544, 'tokens/trainable': 4928226, 'epoch': '0.1069'}
 11%|████████████████████▌                                                                                                                                                                           | 607/5680 [1:31:54<19:22:28, 13.75s/it] 11%|████████████████████▌                                                                                                                                                                           | 608/5680 [1:32:08<19:27:34, 13.81s/it]                                                                                                                                                                                                                                             {'loss': '0.6465', 'grad_norm': '0.216', 'learning_rate': '0.0001944', 'ppl': '1.909', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '572.4', 'tokens/total': 4980736, 'tokens/trainable': 4936213, 'epoch': '0.107'}
 11%|████████████████████▌                                                                                                                                                                           | 608/5680 [1:32:08<19:27:34, 13.81s/it] 11%|████████████████████▌                                                                                                                                                                           | 609/5680 [1:32:21<19:28:06, 13.82s/it]                                                                                                                                                                                                                                             {'loss': '0.4992', 'grad_norm': '0.2562', 'learning_rate': '0.0001944', 'ppl': '1.647', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '554.5', 'tokens/total': 4988928, 'tokens/trainable': 4943885, 'epoch': '0.1072'}
 11%|████████████████████▌                                                                                                                                                                           | 609/5680 [1:32:21<19:28:06, 13.82s/it] 11%|████████████████████▌                                                                                                                                                                           | 610/5680 [1:32:35<19:31:12, 13.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8499', 'grad_norm': '0.2413', 'learning_rate': '0.0001944', 'ppl': '2.339', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '554.2', 'tokens/total': 4997120, 'tokens/trainable': 4951613, 'epoch': '0.1074'}
 11%|████████████████████▌                                                                                                                                                                           | 610/5680 [1:32:35<19:31:12, 13.86s/it] 11%|████████████████████▋                                                                                                                                                                           | 611/5680 [1:32:49<19:25:55, 13.80s/it]                                                                                                                                                                                                                                             {'loss': '0.7946', 'grad_norm': '0.2213', 'learning_rate': '0.0001944', 'ppl': '2.214', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.6', 'tokens/total': 5005312, 'tokens/trainable': 4959665, 'epoch': '0.1076'}
 11%|████████████████████▋                                                                                                                                                                           | 611/5680 [1:32:49<19:25:55, 13.80s/it] 11%|████████████████████▋                                                                                                                                                                           | 612/5680 [1:33:02<19:18:00, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6925', 'grad_norm': '0.2242', 'learning_rate': '0.0001943', 'ppl': '1.999', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.6', 'tokens/total': 5013504, 'tokens/trainable': 4967765, 'epoch': '0.1077'}
 11%|████████████████████▋                                                                                                                                                                           | 612/5680 [1:33:02<19:18:00, 13.71s/it] 11%|████████████████████▋                                                                                                                                                                           | 613/5680 [1:33:16<19:11:33, 13.64s/it]                                                                                                                                                                                                                                             {'loss': '0.7057', 'grad_norm': '0.218', 'learning_rate': '0.0001943', 'ppl': '2.025', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.2', 'tokens/total': 5021696, 'tokens/trainable': 4975733, 'epoch': '0.1079'}
 11%|████████████████████▋                                                                                                                                                                           | 613/5680 [1:33:16<19:11:33, 13.64s/it] 11%|████████████████████▊                                                                                                                                                                           | 614/5680 [1:33:30<19:12:09, 13.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7104', 'grad_norm': '0.2308', 'learning_rate': '0.0001943', 'ppl': '2.035', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.8', 'tokens/total': 5029888, 'tokens/trainable': 4983815, 'epoch': '0.1081'}
 11%|████████████████████▊                                                                                                                                                                           | 614/5680 [1:33:30<19:12:09, 13.65s/it] 11%|████████████████████▊                                                                                                                                                                           | 615/5680 [1:33:43<19:05:11, 13.57s/it]                                                                                                                                                                                                                                             {'loss': '0.7579', 'grad_norm': '0.221', 'learning_rate': '0.0001943', 'ppl': '2.134', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '611.4', 'tokens/total': 5038080, 'tokens/trainable': 4991990, 'epoch': '0.1083'}
 11%|████████████████████▊                                                                                                                                                                           | 615/5680 [1:33:43<19:05:11, 13.57s/it] 11%|████████████████████▊                                                                                                                                                                           | 616/5680 [1:33:56<19:03:28, 13.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6135', 'grad_norm': '0.2107', 'learning_rate': '0.0001943', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.6', 'tokens/total': 5046272, 'tokens/trainable': 5000123, 'epoch': '0.1085'}
 11%|████████████████████▊                                                                                                                                                                           | 616/5680 [1:33:56<19:03:28, 13.55s/it] 11%|████████████████████▊                                                                                                                                                                           | 617/5680 [1:34:10<19:00:01, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5545', 'grad_norm': '0.1954', 'learning_rate': '0.0001943', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '608.9', 'tokens/total': 5054464, 'tokens/trainable': 5008290, 'epoch': '0.1086'}
 11%|████████████████████▊                                                                                                                                                                           | 617/5680 [1:34:10<19:00:01, 13.51s/it] 11%|████████████████████▉                                                                                                                                                                           | 618/5680 [1:34:23<18:57:49, 13.49s/it]                                                                                                                                                                                                                                             {'loss': '0.951', 'grad_norm': '0.2712', 'learning_rate': '0.0001942', 'ppl': '2.588', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.7', 'tokens/total': 5062656, 'tokens/trainable': 5016178, 'epoch': '0.1088'}
 11%|████████████████████▉                                                                                                                                                                           | 618/5680 [1:34:23<18:57:49, 13.49s/it] 11%|████████████████████▉                                                                                                                                                                           | 619/5680 [1:34:37<19:06:57, 13.60s/it]                                                                                                                                                                                                                                             {'loss': '0.8515', 'grad_norm': '0.2533', 'learning_rate': '0.0001942', 'ppl': '2.343', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '571.1', 'tokens/total': 5070848, 'tokens/trainable': 5024084, 'epoch': '0.109'}
 11%|████████████████████▉                                                                                                                                                                           | 619/5680 [1:34:37<19:06:57, 13.60s/it] 11%|████████████████████▉                                                                                                                                                                           | 620/5680 [1:34:51<19:15:10, 13.70s/it]                                                                                                                                                                                                                                             {'loss': '0.7264', 'grad_norm': '0.2219', 'learning_rate': '0.0001942', 'ppl': '2.068', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '559.3', 'tokens/total': 5079040, 'tokens/trainable': 5031871, 'epoch': '0.1092'}
 11%|████████████████████▉                                                                                                                                                                           | 620/5680 [1:34:51<19:15:10, 13.70s/it] 11%|████████████████████▉                                                                                                                                                                           | 621/5680 [1:35:05<19:24:48, 13.81s/it]                                                                                                                                                                                                                                             {'loss': '0.8793', 'grad_norm': '0.2315', 'learning_rate': '0.0001942', 'ppl': '2.409', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '578', 'tokens/total': 5087232, 'tokens/trainable': 5040013, 'epoch': '0.1093'}
 11%|████████████████████▉                                                                                                                                                                           | 621/5680 [1:35:05<19:24:48, 13.81s/it] 11%|█████████████████████                                                                                                                                                                           | 622/5680 [1:35:19<19:36:33, 13.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7867', 'grad_norm': '0.2348', 'learning_rate': '0.0001942', 'ppl': '2.196', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '552.4', 'tokens/total': 5095424, 'tokens/trainable': 5047901, 'epoch': '0.1095'}
 11%|█████████████████████                                                                                                                                                                           | 622/5680 [1:35:19<19:36:33, 13.96s/it] 11%|█████████████████████                                                                                                                                                                           | 623/5680 [1:35:34<19:37:53, 13.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6943', 'grad_norm': '0.2568', 'learning_rate': '0.0001941', 'ppl': '2.002', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '579.8', 'tokens/total': 5103616, 'tokens/trainable': 5056028, 'epoch': '0.1097'}
 11%|█████████████████████                                                                                                                                                                           | 623/5680 [1:35:34<19:37:53, 13.98s/it] 11%|█████████████████████                                                                                                                                                                           | 624/5680 [1:35:48<19:41:01, 14.02s/it]                                                                                                                                                                                                                                             {'loss': '1.07', 'grad_norm': '0.2671', 'learning_rate': '0.0001941', 'ppl': '2.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '574.5', 'tokens/total': 5111808, 'tokens/trainable': 5064128, 'epoch': '0.1099'}
 11%|█████████████████████                                                                                                                                                                           | 624/5680 [1:35:48<19:41:01, 14.02s/it] 11%|█████████████████████▏                                                                                                                                                                          | 625/5680 [1:36:01<19:29:06, 13.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9664', 'grad_norm': '0.2561', 'learning_rate': '0.0001941', 'ppl': '2.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '574.1', 'tokens/total': 5120000, 'tokens/trainable': 5071903, 'epoch': '0.11'}
 11%|█████████████████████▏                                                                                                                                                                          | 625/5680 [1:36:01<19:29:06, 13.88s/it] 11%|█████████████████████▏                                                                                                                                                                          | 626/5680 [1:36:15<19:20:33, 13.78s/it]                                                                                                                                                                                                                                             {'loss': '0.4574', 'grad_norm': '0.2019', 'learning_rate': '0.0001941', 'ppl': '1.58', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.3', 'tokens/total': 5128192, 'tokens/trainable': 5079923, 'epoch': '0.1102'}
 11%|█████████████████████▏                                                                                                                                                                          | 626/5680 [1:36:15<19:20:33, 13.78s/it] 11%|█████████████████████▏                                                                                                                                                                          | 627/5680 [1:36:28<19:12:34, 13.69s/it]                                                                                                                                                                                                                                             {'loss': '0.6567', 'grad_norm': '0.2184', 'learning_rate': '0.0001941', 'ppl': '1.928', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.6', 'tokens/total': 5136384, 'tokens/trainable': 5087806, 'epoch': '0.1104'}
 11%|█████████████████████▏                                                                                                                                                                          | 627/5680 [1:36:28<19:12:34, 13.69s/it] 11%|█████████████████████▏                                                                                                                                                                          | 628/5680 [1:36:42<19:04:27, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '1.213', 'grad_norm': '0.2679', 'learning_rate': '0.000194', 'ppl': '3.364', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.6', 'tokens/total': 5144576, 'tokens/trainable': 5095699, 'epoch': '0.1106'}
 11%|█████████████████████▏                                                                                                                                                                          | 628/5680 [1:36:42<19:04:27, 13.59s/it] 11%|█████████████████████▎                                                                                                                                                                          | 629/5680 [1:36:55<19:05:58, 13.61s/it]                                                                                                                                                                                                                                             {'loss': '0.6665', 'grad_norm': '0.2291', 'learning_rate': '0.000194', 'ppl': '1.947', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.2', 'tokens/total': 5152768, 'tokens/trainable': 5103715, 'epoch': '0.1107'}
 11%|█████████████████████▎                                                                                                                                                                          | 629/5680 [1:36:55<19:05:58, 13.61s/it] 11%|█████████████████████▎                                                                                                                                                                          | 630/5680 [1:37:09<18:58:10, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.6002', 'grad_norm': '0.2057', 'learning_rate': '0.000194', 'ppl': '1.823', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.5', 'tokens/total': 5160960, 'tokens/trainable': 5111637, 'epoch': '0.1109'}
 11%|█████████████████████▎                                                                                                                                                                          | 630/5680 [1:37:09<18:58:10, 13.52s/it] 11%|█████████████████████▎                                                                                                                                                                          | 631/5680 [1:37:22<18:54:11, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.6942', 'grad_norm': '0.2125', 'learning_rate': '0.000194', 'ppl': '2.002', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '610.1', 'tokens/total': 5169152, 'tokens/trainable': 5119791, 'epoch': '0.1111'}
 11%|█████████████████████▎                                                                                                                                                                          | 631/5680 [1:37:22<18:54:11, 13.48s/it] 11%|█████████████████████▎                                                                                                                                                                          | 632/5680 [1:37:35<18:47:24, 13.40s/it]                                                                                                                                                                                                                                             {'loss': '0.5327', 'grad_norm': '0.1876', 'learning_rate': '0.000194', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '605', 'tokens/total': 5177344, 'tokens/trainable': 5127782, 'epoch': '0.1113'}
 11%|█████████████████████▎                                                                                                                                                                          | 632/5680 [1:37:35<18:47:24, 13.40s/it] 11%|█████████████████████▍                                                                                                                                                                          | 633/5680 [1:37:48<18:44:19, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.6406', 'grad_norm': '0.1951', 'learning_rate': '0.000194', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '571.6', 'tokens/total': 5185536, 'tokens/trainable': 5135371, 'epoch': '0.1114'}
 11%|█████████████████████▍                                                                                                                                                                          | 633/5680 [1:37:48<18:44:19, 13.37s/it] 11%|█████████████████████▍                                                                                                                                                                          | 634/5680 [1:38:02<18:46:23, 13.39s/it]                                                                                                                                                                                                                                             {'loss': '0.8061', 'grad_norm': '0.2362', 'learning_rate': '0.0001939', 'ppl': '2.239', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.3', 'tokens/total': 5193728, 'tokens/trainable': 5143389, 'epoch': '0.1116'}
 11%|█████████████████████▍                                                                                                                                                                          | 634/5680 [1:38:02<18:46:23, 13.39s/it] 11%|█████████████████████▍                                                                                                                                                                          | 635/5680 [1:38:16<18:56:16, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.782', 'grad_norm': '0.2268', 'learning_rate': '0.0001939', 'ppl': '2.186', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.6', 'tokens/total': 5201920, 'tokens/trainable': 5151483, 'epoch': '0.1118'}
 11%|█████████████████████▍                                                                                                                                                                          | 635/5680 [1:38:16<18:56:16, 13.51s/it] 11%|█████████████████████▍                                                                                                                                                                          | 636/5680 [1:38:29<18:58:36, 13.54s/it]                                                                                                                                                                                                                                             {'loss': '0.5639', 'grad_norm': '0.2017', 'learning_rate': '0.0001939', 'ppl': '1.757', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593.1', 'tokens/total': 5210112, 'tokens/trainable': 5159553, 'epoch': '0.112'}
 11%|█████████████████████▍                                                                                                                                                                          | 636/5680 [1:38:29<18:58:36, 13.54s/it] 11%|█████████████████████▌                                                                                                                                                                          | 637/5680 [1:38:43<19:09:15, 13.67s/it]                                                                                                                                                                                                                                             {'loss': '0.7135', 'grad_norm': '0.2433', 'learning_rate': '0.0001939', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '559.6', 'tokens/total': 5218304, 'tokens/trainable': 5167365, 'epoch': '0.1121'}
 11%|█████████████████████▌                                                                                                                                                                          | 637/5680 [1:38:43<19:09:15, 13.67s/it] 11%|█████████████████████▌                                                                                                                                                                          | 638/5680 [1:38:57<19:13:07, 13.72s/it]                                                                                                                                                                                                                                             {'loss': '0.932', 'grad_norm': '0.2357', 'learning_rate': '0.0001939', 'ppl': '2.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '563.7', 'tokens/total': 5226496, 'tokens/trainable': 5175161, 'epoch': '0.1123'}
 11%|█████████████████████▌                                                                                                                                                                          | 638/5680 [1:38:57<19:13:07, 13.72s/it] 11%|█████████████████████▌                                                                                                                                                                          | 639/5680 [1:39:11<19:08:29, 13.67s/it]                                                                                                                                                                                                                                             {'loss': '0.768', 'grad_norm': '0.2277', 'learning_rate': '0.0001938', 'ppl': '2.155', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.6', 'tokens/total': 5234688, 'tokens/trainable': 5183268, 'epoch': '0.1125'}
 11%|█████████████████████▌                                                                                                                                                                          | 639/5680 [1:39:11<19:08:29, 13.67s/it] 11%|█████████████████████▋                                                                                                                                                                          | 640/5680 [1:39:24<19:12:03, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.762', 'grad_norm': '0.2283', 'learning_rate': '0.0001938', 'ppl': '2.143', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '571.2', 'tokens/total': 5242880, 'tokens/trainable': 5191161, 'epoch': '0.1127'}
 11%|█████████████████████▋                                                                                                                                                                          | 640/5680 [1:39:24<19:12:03, 13.71s/it] 11%|█████████████████████▋                                                                                                                                                                          | 641/5680 [1:39:38<19:14:34, 13.75s/it]                                                                                                                                                                                                                                             {'loss': '0.7485', 'grad_norm': '0.2017', 'learning_rate': '0.0001938', 'ppl': '2.114', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.4', 'tokens/total': 5251072, 'tokens/trainable': 5199305, 'epoch': '0.1129'}
 11%|█████████████████████▋                                                                                                                                                                          | 641/5680 [1:39:38<19:14:34, 13.75s/it] 11%|█████████████████████▋                                                                                                                                                                          | 642/5680 [1:39:53<19:27:02, 13.90s/it]                                                                                                                                                                                                                                             {'loss': '0.609', 'grad_norm': '0.234', 'learning_rate': '0.0001938', 'ppl': '1.839', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '567.1', 'tokens/total': 5259264, 'tokens/trainable': 5207385, 'epoch': '0.113'}
 11%|█████████████████████▋                                                                                                                                                                          | 642/5680 [1:39:53<19:27:02, 13.90s/it] 11%|█████████████████████▋                                                                                                                                                                          | 643/5680 [1:40:07<19:32:20, 13.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4462', 'grad_norm': '0.1932', 'learning_rate': '0.0001938', 'ppl': '1.562', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '550.9', 'tokens/total': 5267456, 'tokens/trainable': 5215162, 'epoch': '0.1132'}
 11%|█████████████████████▋                                                                                                                                                                          | 643/5680 [1:40:07<19:32:20, 13.96s/it] 11%|█████████████████████▊                                                                                                                                                                          | 644/5680 [1:40:21<19:31:28, 13.96s/it]                                                                                                                                                                                                                                             {'loss': '0.9079', 'grad_norm': '0.2503', 'learning_rate': '0.0001937', 'ppl': '2.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '575', 'tokens/total': 5275648, 'tokens/trainable': 5223172, 'epoch': '0.1134'}
 11%|█████████████████████▊                                                                                                                                                                          | 644/5680 [1:40:21<19:31:28, 13.96s/it] 11%|█████████████████████▊                                                                                                                                                                          | 645/5680 [1:40:34<19:25:09, 13.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6994', 'grad_norm': '0.2376', 'learning_rate': '0.0001937', 'ppl': '2.013', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.5', 'tokens/total': 5283840, 'tokens/trainable': 5231235, 'epoch': '0.1136'}
 11%|█████████████████████▊                                                                                                                                                                          | 645/5680 [1:40:34<19:25:09, 13.88s/it] 11%|█████████████████████▊                                                                                                                                                                          | 646/5680 [1:40:48<19:16:43, 13.79s/it]                                                                                                                                                                                                                                             {'loss': '0.8895', 'grad_norm': '0.254', 'learning_rate': '0.0001937', 'ppl': '2.434', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '563.5', 'tokens/total': 5292032, 'tokens/trainable': 5238870, 'epoch': '0.1137'}
 11%|█████████████████████▊                                                                                                                                                                          | 646/5680 [1:40:48<19:16:43, 13.79s/it] 11%|█████████████████████▊                                                                                                                                                                          | 647/5680 [1:41:01<19:02:36, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.7456', 'grad_norm': '0.2054', 'learning_rate': '0.0001937', 'ppl': '2.108', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '608.6', 'tokens/total': 5300224, 'tokens/trainable': 5246920, 'epoch': '0.1139'}
 11%|█████████████████████▊                                                                                                                                                                          | 647/5680 [1:41:01<19:02:36, 13.62s/it] 11%|█████████████████████▉                                                                                                                                                                          | 648/5680 [1:41:15<18:59:34, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.642', 'grad_norm': '0.1925', 'learning_rate': '0.0001937', 'ppl': '1.9', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.3', 'tokens/total': 5308416, 'tokens/trainable': 5255079, 'epoch': '0.1141'}
 11%|█████████████████████▉                                                                                                                                                                          | 648/5680 [1:41:15<18:59:34, 13.59s/it] 11%|█████████████████████▉                                                                                                                                                                          | 649/5680 [1:41:29<19:09:37, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6899', 'grad_norm': '0.2561', 'learning_rate': '0.0001936', 'ppl': '1.993', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '573.5', 'tokens/total': 5316608, 'tokens/trainable': 5263105, 'epoch': '0.1143'}
 11%|█████████████████████▉                                                                                                                                                                          | 649/5680 [1:41:29<19:09:37, 13.71s/it] 11%|█████████████████████▉                                                                                                                                                                          | 650/5680 [1:41:43<19:16:12, 13.79s/it]                                                                                                                                                                                                                                             {'loss': '1.184', 'grad_norm': '0.263', 'learning_rate': '0.0001936', 'ppl': '3.266', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '580.9', 'tokens/total': 5324800, 'tokens/trainable': 5271224, 'epoch': '0.1144'}
 11%|█████████████████████▉                                                                                                                                                                          | 650/5680 [1:41:43<19:16:12, 13.79s/it] 11%|██████████████████████                                                                                                                                                                          | 651/5680 [1:41:57<19:21:17, 13.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7779', 'grad_norm': '0.2378', 'learning_rate': '0.0001936', 'ppl': '2.177', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '563.6', 'tokens/total': 5332992, 'tokens/trainable': 5279115, 'epoch': '0.1146'}
 11%|██████████████████████                                                                                                                                                                          | 651/5680 [1:41:57<19:21:17, 13.86s/it] 11%|██████████████████████                                                                                                                                                                          | 652/5680 [1:42:11<19:29:07, 13.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8476', 'grad_norm': '0.223', 'learning_rate': '0.0001936', 'ppl': '2.334', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '557.8', 'tokens/total': 5341184, 'tokens/trainable': 5287017, 'epoch': '0.1148'}
 11%|██████████████████████                                                                                                                                                                          | 652/5680 [1:42:11<19:29:07, 13.95s/it] 11%|██████████████████████                                                                                                                                                                          | 653/5680 [1:42:25<19:33:35, 14.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7355', 'grad_norm': '0.2613', 'learning_rate': '0.0001936', 'ppl': '2.087', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '564.6', 'tokens/total': 5349376, 'tokens/trainable': 5294995, 'epoch': '0.115'}
 11%|██████████████████████                                                                                                                                                                          | 653/5680 [1:42:25<19:33:35, 14.01s/it] 12%|██████████████████████                                                                                                                                                                          | 654/5680 [1:42:38<19:17:58, 13.82s/it]                                                                                                                                                                                                                                             {'loss': '0.7227', 'grad_norm': '0.2384', 'learning_rate': '0.0001935', 'ppl': '2.06', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598', 'tokens/total': 5357568, 'tokens/trainable': 5302998, 'epoch': '0.1151'}
 12%|██████████████████████                                                                                                                                                                          | 654/5680 [1:42:38<19:17:58, 13.82s/it] 12%|██████████████████████▏                                                                                                                                                                         | 655/5680 [1:42:52<19:05:16, 13.68s/it]                                                                                                                                                                                                                                             {'loss': '0.8739', 'grad_norm': '0.241', 'learning_rate': '0.0001935', 'ppl': '2.396', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.4', 'tokens/total': 5365760, 'tokens/trainable': 5310940, 'epoch': '0.1153'}
 12%|██████████████████████▏                                                                                                                                                                         | 655/5680 [1:42:52<19:05:16, 13.68s/it] 12%|██████████████████████▏                                                                                                                                                                         | 656/5680 [1:43:05<19:04:09, 13.66s/it]                                                                                                                                                                                                                                             {'loss': '0.5045', 'grad_norm': '0.2373', 'learning_rate': '0.0001935', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '553.5', 'tokens/total': 5373952, 'tokens/trainable': 5318484, 'epoch': '0.1155'}
 12%|██████████████████████▏                                                                                                                                                                         | 656/5680 [1:43:05<19:04:09, 13.66s/it] 12%|██████████████████████▏                                                                                                                                                                         | 657/5680 [1:43:19<18:57:08, 13.58s/it]                                                                                                                                                                                                                                             {'loss': '0.8313', 'grad_norm': '0.2503', 'learning_rate': '0.0001935', 'ppl': '2.296', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '581.6', 'tokens/total': 5382144, 'tokens/trainable': 5326268, 'epoch': '0.1157'}
 12%|██████████████████████▏                                                                                                                                                                         | 657/5680 [1:43:19<18:57:08, 13.58s/it] 12%|██████████████████████▏                                                                                                                                                                         | 658/5680 [1:43:33<19:07:28, 13.71s/it]                                                                                                                                                                                                                                             {'loss': '0.7314', 'grad_norm': '0.2335', 'learning_rate': '0.0001935', 'ppl': '2.078', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '570.6', 'tokens/total': 5390336, 'tokens/trainable': 5334254, 'epoch': '0.1158'}
 12%|██████████████████████▏                                                                                                                                                                         | 658/5680 [1:43:33<19:07:28, 13.71s/it] 12%|██████████████████████▎                                                                                                                                                                         | 659/5680 [1:43:49<20:06:22, 14.42s/it]                                                                                                                                                                                                                                             {'loss': '0.8021', 'grad_norm': '0.2619', 'learning_rate': '0.0001935', 'ppl': '2.23', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '522.4', 'tokens/total': 5398528, 'tokens/trainable': 5341416, 'epoch': '0.116'}
 12%|██████████████████████▎                                                                                                                                                                         | 659/5680 [1:43:49<20:06:22, 14.42s/it][2026-01-26 23:33:02,923] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:58791] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-26 23:33:04,174] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:58791] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<03:51, 24.10 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:44, 52.42 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:05<01:03, 83.88 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:44, 116.82 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:33, 154.06 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:06<00:26, 191.44 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:22, 215.11 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:20, 236.75 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:07<00:18, 257.96 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:17, 267.66 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:14, 308.61 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:08<00:16, 272.19 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:15, 274.71 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:12, 332.36 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:09<00:14, 273.77 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:14, 280.29 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:10<00:14, 272.24 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:10<00:11, 333.47 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:12, 288.46 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:11<00:12, 288.70 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:11<00:10, 333.57 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:12, 278.21 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:12<00:11, 286.92 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:12<00:10, 289.34 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:10, 291.01 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:13<00:10, 292.45 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:13<00:09, 296.56 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:08, 306.36 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:14<00:08, 300.73 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:14<00:08, 305.54 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:14<00:07, 312.29 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:15<00:07, 310.20 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:15<00:07, 309.37 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:15<00:06, 308.63 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:16<00:06, 309.39 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:16<00:06, 308.55 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:16<00:05, 306.57 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:17<00:05, 306.29 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:17<00:05, 305.33 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:17<00:04, 307.37 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:18<00:03, 347.84 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:18<00:04, 302.91 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:18<00:03, 296.77 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:19<00:03, 304.42 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:19<00:02, 346.75 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:19<00:02, 289.85 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:20<00:02, 293.45 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:20<00:01, 333.67 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:20<00:01, 294.76 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:21<00:01, 297.29 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:21<00:00, 337.53 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:21<00:00, 295.20 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:22<00:00, 332.73 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:22<00:00, 333.58 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:23<00:00, 245.09 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 902.82 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1283.19 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1543.48 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1612.00 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1734.21 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1758.30 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1570.31 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1414.08 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2084.88 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2379.17 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2596.73 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2675.44 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2393.11 examples/s]
[2026-01-26 23:33:33,699] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:58791] Using single process for pack_parallel, running sequentially.
[2026-01-26 23:33:39,676] [WARNING] [py.warnings._showwarnmsg:109] [PID:58791] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 12%|██████████████████████▎                                                                                                                                                                         | 660/5680 [1:44:39<34:59:18, 25.09s/it]                                                                                                                                                                                                                                             {'loss': '1.345', 'grad_norm': '0.414', 'learning_rate': '0.0001934', 'ppl': '3.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '375.9', 'tokens/total': 5406720, 'tokens/trainable': 5346338, 'epoch': '1'}
 12%|██████████████████████▎                                                                                                                                                                         | 660/5680 [1:44:39<34:59:18, 25.09s/it][2026-01-26 23:33:52,910] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:59030] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-26 23:33:54,081] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:59030] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<03:45, 24.69 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:37, 56.10 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:04<01:02, 85.16 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:44, 119.02 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:33, 153.85 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:27, 185.91 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:23, 211.32 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:20, 233.94 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:07<00:18, 253.54 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:15, 297.52 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:16, 266.99 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:08<00:15, 279.09 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:14, 288.02 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:14, 293.52 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:09<00:13, 304.16 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:13, 301.09 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:09<00:12, 305.63 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:11, 342.11 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:12, 295.92 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:10<00:12, 297.06 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:11<00:11, 300.52 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:11, 301.51 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:11<00:10, 305.38 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:12<00:10, 306.98 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:10, 299.30 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:12<00:09, 302.27 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:13<00:09, 303.14 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:09, 302.87 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:13<00:08, 297.34 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:14<00:08, 301.48 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:14<00:07, 304.13 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:14<00:07, 311.63 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:15<00:07, 311.41 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:15<00:06, 315.29 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:15<00:06, 322.99 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:16<00:05, 320.98 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:16<00:05, 319.28 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:16<00:05, 322.85 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:17<00:04, 317.42 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:17<00:04, 301.22 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:17<00:04, 327.57 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:18<00:03, 368.86 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:18<00:03, 319.93 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:18<00:03, 323.08 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:19<00:02, 319.88 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:19<00:02, 318.92 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:19<00:02, 317.61 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:20<00:01, 316.04 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:20<00:01, 318.25 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:20<00:01, 318.16 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:21<00:00, 319.54 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:21<00:00, 339.41 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:21<00:00, 332.08 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:22<00:00, 338.29 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:22<00:00, 251.14 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 918.60 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1287.95 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1479.14 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1612.74 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1626.45 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1644.14 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1509.49 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1332.96 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1953.91 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2262.43 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2542.67 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2615.93 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2308.17 examples/s]
[2026-01-26 23:34:28,823] [WARNING] [py.warnings._showwarnmsg:109] [PID:59030] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 12%|██████████████████████▎                                                                                                                                                                         | 661/5680 [1:45:28<45:05:34, 32.34s/it]                                                                                                                                                                                                                                             {'loss': '0.6992', 'grad_norm': '0.2144', 'learning_rate': '0.0001934', 'ppl': '2.012', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '165.9', 'tokens/total': 5414912, 'tokens/trainable': 5354509, 'epoch': '1'}
 12%|██████████████████████▎                                                                                                                                                                         | 661/5680 [1:45:28<45:05:34, 32.34s/it] 12%|██████████████████████▍                                                                                                                                                                         | 662/5680 [1:45:41<37:02:39, 26.58s/it]                                                                                                                                                                                                                                             {'loss': '0.6679', 'grad_norm': '0.2081', 'learning_rate': '0.0001934', 'ppl': '1.95', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625.9', 'tokens/total': 5423104, 'tokens/trainable': 5362672, 'epoch': '1.001'}
 12%|██████████████████████▍                                                                                                                                                                         | 662/5680 [1:45:41<37:02:39, 26.58s/it] 12%|██████████████████████▍                                                                                                                                                                         | 663/5680 [1:45:54<31:22:14, 22.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5636', 'grad_norm': '0.1966', 'learning_rate': '0.0001934', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625.4', 'tokens/total': 5431296, 'tokens/trainable': 5370809, 'epoch': '1.001'}
 12%|██████████████████████▍                                                                                                                                                                         | 663/5680 [1:45:54<31:22:14, 22.51s/it] 12%|██████████████████████▍                                                                                                                                                                         | 664/5680 [1:46:08<27:42:29, 19.89s/it]                                                                                                                                                                                                                                             {'loss': '0.707', 'grad_norm': '0.2122', 'learning_rate': '0.0001934', 'ppl': '2.028', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.3', 'tokens/total': 5439488, 'tokens/trainable': 5378927, 'epoch': '1.001'}
 12%|██████████████████████▍                                                                                                                                                                         | 664/5680 [1:46:08<27:42:29, 19.89s/it] 12%|██████████████████████▍                                                                                                                                                                         | 665/5680 [1:46:22<25:09:06, 18.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7659', 'grad_norm': '0.2177', 'learning_rate': '0.0001933', 'ppl': '2.151', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593.9', 'tokens/total': 5447680, 'tokens/trainable': 5387107, 'epoch': '1.001'}
 12%|██████████████████████▍                                                                                                                                                                         | 665/5680 [1:46:22<25:09:06, 18.06s/it] 12%|██████████████████████▌                                                                                                                                                                         | 666/5680 [1:46:36<23:23:05, 16.79s/it]                                                                                                                                                                                                                                             {'loss': '0.8413', 'grad_norm': '0.2186', 'learning_rate': '0.0001933', 'ppl': '2.319', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.5', 'tokens/total': 5455872, 'tokens/trainable': 5395231, 'epoch': '1.001'}
 12%|██████████████████████▌                                                                                                                                                                         | 666/5680 [1:46:36<23:23:05, 16.79s/it] 12%|██████████████████████▌                                                                                                                                                                         | 667/5680 [1:46:49<21:54:55, 15.74s/it]                                                                                                                                                                                                                                             {'loss': '0.8188', 'grad_norm': '0.2494', 'learning_rate': '0.0001933', 'ppl': '2.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '615.5', 'tokens/total': 5464064, 'tokens/trainable': 5403402, 'epoch': '1.001'}
 12%|██████████████████████▌                                                                                                                                                                         | 667/5680 [1:46:49<21:54:55, 15.74s/it] 12%|██████████████████████▌                                                                                                                                                                         | 668/5680 [1:47:02<20:47:04, 14.93s/it]                                                                                                                                                                                                                                             {'loss': '0.9621', 'grad_norm': '0.249', 'learning_rate': '0.0001933', 'ppl': '2.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '627.3', 'tokens/total': 5472256, 'tokens/trainable': 5411572, 'epoch': '1.002'}
 12%|██████████████████████▌                                                                                                                                                                         | 668/5680 [1:47:02<20:47:04, 14.93s/it] 12%|██████████████████████▌                                                                                                                                                                         | 669/5680 [1:47:15<19:55:15, 14.31s/it]                                                                                                                                                                                                                                             {'loss': '0.4741', 'grad_norm': '0.1863', 'learning_rate': '0.0001933', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636', 'tokens/total': 5480448, 'tokens/trainable': 5419749, 'epoch': '1.002'}
 12%|██████████████████████▌                                                                                                                                                                         | 669/5680 [1:47:15<19:55:15, 14.31s/it] 12%|██████████████████████▋                                                                                                                                                                         | 670/5680 [1:47:28<19:17:19, 13.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7686', 'grad_norm': '0.2228', 'learning_rate': '0.0001932', 'ppl': '2.157', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '638.3', 'tokens/total': 5488640, 'tokens/trainable': 5427915, 'epoch': '1.002'}
 12%|██████████████████████▋                                                                                                                                                                         | 670/5680 [1:47:28<19:17:19, 13.86s/it] 12%|██████████████████████▋                                                                                                                                                                         | 671/5680 [1:47:41<19:17:28, 13.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7848', 'grad_norm': '0.2145', 'learning_rate': '0.0001932', 'ppl': '2.192', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.1', 'tokens/total': 5496832, 'tokens/trainable': 5436067, 'epoch': '1.002'}
 12%|██████████████████████▋                                                                                                                                                                         | 671/5680 [1:47:41<19:17:28, 13.86s/it] 12%|██████████████████████▋                                                                                                                                                                         | 672/5680 [1:47:55<19:17:32, 13.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7076', 'grad_norm': '0.2295', 'learning_rate': '0.0001932', 'ppl': '2.029', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.2', 'tokens/total': 5505024, 'tokens/trainable': 5444180, 'epoch': '1.002'}
 12%|██████████████████████▋                                                                                                                                                                         | 672/5680 [1:47:55<19:17:32, 13.87s/it] 12%|██████████████████████▋                                                                                                                                                                         | 673/5680 [1:48:09<19:21:55, 13.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6537', 'grad_norm': '0.2529', 'learning_rate': '0.0001932', 'ppl': '1.923', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '580', 'tokens/total': 5513216, 'tokens/trainable': 5452326, 'epoch': '1.002'}
 12%|██████████████████████▋                                                                                                                                                                         | 673/5680 [1:48:09<19:21:55, 13.92s/it] 12%|██████████████████████▊                                                                                                                                                                         | 674/5680 [1:48:23<19:13:32, 13.83s/it]                                                                                                                                                                                                                                             {'loss': '1.101', 'grad_norm': '0.2547', 'learning_rate': '0.0001932', 'ppl': '3.007', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.2', 'tokens/total': 5521408, 'tokens/trainable': 5460467, 'epoch': '1.003'}
 12%|██████████████████████▊                                                                                                                                                                         | 674/5680 [1:48:23<19:13:32, 13.83s/it] 12%|██████████████████████▊                                                                                                                                                                         | 675/5680 [1:48:36<18:52:08, 13.57s/it]                                                                                                                                                                                                                                             {'loss': '0.9107', 'grad_norm': '0.2342', 'learning_rate': '0.0001931', 'ppl': '2.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625', 'tokens/total': 5529600, 'tokens/trainable': 5468572, 'epoch': '1.003'}
 12%|██████████████████████▊                                                                                                                                                                         | 675/5680 [1:48:36<18:52:08, 13.57s/it] 12%|██████████████████████▊                                                                                                                                                                         | 676/5680 [1:48:49<18:35:23, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.9227', 'grad_norm': '0.24', 'learning_rate': '0.0001931', 'ppl': '2.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.9', 'tokens/total': 5537792, 'tokens/trainable': 5476719, 'epoch': '1.003'}
 12%|██████████████████████▊                                                                                                                                                                         | 676/5680 [1:48:49<18:35:23, 13.37s/it] 12%|██████████████████████▉                                                                                                                                                                         | 677/5680 [1:49:02<18:22:22, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.9108', 'grad_norm': '0.251', 'learning_rate': '0.0001931', 'ppl': '2.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '633', 'tokens/total': 5545984, 'tokens/trainable': 5484857, 'epoch': '1.003'}
 12%|██████████████████████▉                                                                                                                                                                         | 677/5680 [1:49:02<18:22:22, 13.22s/it] 12%|██████████████████████▉                                                                                                                                                                         | 678/5680 [1:49:15<18:30:53, 13.33s/it]                                                                                                                                                                                                                                             {'loss': '0.7029', 'grad_norm': '0.223', 'learning_rate': '0.0001931', 'ppl': '2.02', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.1', 'tokens/total': 5554176, 'tokens/trainable': 5492982, 'epoch': '1.003'}
 12%|██████████████████████▉                                                                                                                                                                         | 678/5680 [1:49:15<18:30:53, 13.33s/it] 12%|██████████████████████▉                                                                                                                                                                         | 679/5680 [1:49:29<18:43:52, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.7406', 'grad_norm': '0.2382', 'learning_rate': '0.0001931', 'ppl': '2.097', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.8', 'tokens/total': 5562368, 'tokens/trainable': 5501150, 'epoch': '1.004'}
 12%|██████████████████████▉                                                                                                                                                                         | 679/5680 [1:49:29<18:43:52, 13.48s/it] 12%|██████████████████████▉                                                                                                                                                                         | 680/5680 [1:49:43<18:55:16, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.4851', 'grad_norm': '0.189', 'learning_rate': '0.000193', 'ppl': '1.624', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '584.2', 'tokens/total': 5570560, 'tokens/trainable': 5509292, 'epoch': '1.004'}
 12%|██████████████████████▉                                                                                                                                                                         | 680/5680 [1:49:43<18:55:16, 13.62s/it] 12%|███████████████████████                                                                                                                                                                         | 681/5680 [1:49:57<18:55:01, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.8071', 'grad_norm': '0.2309', 'learning_rate': '0.000193', 'ppl': '2.241', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599', 'tokens/total': 5578752, 'tokens/trainable': 5517447, 'epoch': '1.004'}
 12%|███████████████████████                                                                                                                                                                         | 681/5680 [1:49:57<18:55:01, 13.62s/it] 12%|███████████████████████                                                                                                                                                                         | 682/5680 [1:50:10<18:37:58, 13.42s/it]                                                                                                                                                                                                                                             {'loss': '0.6669', 'grad_norm': '0.2135', 'learning_rate': '0.000193', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.2', 'tokens/total': 5586944, 'tokens/trainable': 5525600, 'epoch': '1.004'}
 12%|███████████████████████                                                                                                                                                                         | 682/5680 [1:50:10<18:37:58, 13.42s/it] 12%|███████████████████████                                                                                                                                                                         | 683/5680 [1:50:23<18:31:25, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.8601', 'grad_norm': '0.2252', 'learning_rate': '0.000193', 'ppl': '2.363', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '621.6', 'tokens/total': 5595136, 'tokens/trainable': 5533777, 'epoch': '1.004'}
 12%|███████████████████████                                                                                                                                                                         | 683/5680 [1:50:23<18:31:25, 13.35s/it] 12%|███████████████████████                                                                                                                                                                         | 684/5680 [1:50:36<18:21:56, 13.23s/it]                                                                                                                                                                                                                                             {'loss': '0.8379', 'grad_norm': '0.2379', 'learning_rate': '0.0001929', 'ppl': '2.312', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.1', 'tokens/total': 5603328, 'tokens/trainable': 5541944, 'epoch': '1.004'}
 12%|███████████████████████                                                                                                                                                                         | 684/5680 [1:50:36<18:21:56, 13.23s/it] 12%|███████████████████████▏                                                                                                                                                                        | 685/5680 [1:50:49<18:23:33, 13.26s/it]                                                                                                                                                                                                                                             {'loss': '0.7682', 'grad_norm': '0.2511', 'learning_rate': '0.0001929', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '611.7', 'tokens/total': 5611520, 'tokens/trainable': 5550074, 'epoch': '1.005'}
 12%|███████████████████████▏                                                                                                                                                                        | 685/5680 [1:50:49<18:23:33, 13.26s/it] 12%|███████████████████████▏                                                                                                                                                                        | 686/5680 [1:51:03<18:37:20, 13.42s/it]                                                                                                                                                                                                                                             {'loss': '0.8544', 'grad_norm': '0.2195', 'learning_rate': '0.0001929', 'ppl': '2.35', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.1', 'tokens/total': 5619712, 'tokens/trainable': 5558222, 'epoch': '1.005'}
 12%|███████████████████████▏                                                                                                                                                                        | 686/5680 [1:51:03<18:37:20, 13.42s/it] 12%|███████████████████████▏                                                                                                                                                                        | 687/5680 [1:51:17<18:44:51, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.9557', 'grad_norm': '0.2613', 'learning_rate': '0.0001929', 'ppl': '2.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.1', 'tokens/total': 5627904, 'tokens/trainable': 5566398, 'epoch': '1.005'}
 12%|███████████████████████▏                                                                                                                                                                        | 687/5680 [1:51:17<18:44:51, 13.52s/it] 12%|███████████████████████▎                                                                                                                                                                        | 688/5680 [1:51:31<18:59:45, 13.70s/it]                                                                                                                                                                                                                                             {'loss': '0.7583', 'grad_norm': '0.2197', 'learning_rate': '0.0001929', 'ppl': '2.135', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '578.5', 'tokens/total': 5636096, 'tokens/trainable': 5574563, 'epoch': '1.005'}
 12%|███████████████████████▎                                                                                                                                                                        | 688/5680 [1:51:31<18:59:45, 13.70s/it] 12%|███████████████████████▎                                                                                                                                                                        | 689/5680 [1:51:44<18:45:01, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.5524', 'grad_norm': '0.2495', 'learning_rate': '0.0001928', 'ppl': '1.737', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '619.8', 'tokens/total': 5644288, 'tokens/trainable': 5582689, 'epoch': '1.005'}
 12%|███████████████████████▎                                                                                                                                                                        | 689/5680 [1:51:44<18:45:01, 13.52s/it] 12%|███████████████████████▎                                                                                                                                                                        | 690/5680 [1:51:57<18:33:10, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.5852', 'grad_norm': '0.196', 'learning_rate': '0.0001928', 'ppl': '1.795', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '626.5', 'tokens/total': 5652480, 'tokens/trainable': 5590861, 'epoch': '1.005'}
 12%|███████████████████████▎                                                                                                                                                                        | 690/5680 [1:51:57<18:33:10, 13.38s/it] 12%|███████████████████████▎                                                                                                                                                                        | 691/5680 [1:52:10<18:15:29, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.662', 'grad_norm': '0.2355', 'learning_rate': '0.0001928', 'ppl': '1.939', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.1', 'tokens/total': 5660672, 'tokens/trainable': 5599039, 'epoch': '1.006'}
 12%|███████████████████████▎                                                                                                                                                                        | 691/5680 [1:52:10<18:15:29, 13.17s/it] 12%|███████████████████████▍                                                                                                                                                                        | 692/5680 [1:52:23<18:16:43, 13.19s/it]                                                                                                                                                                                                                                             {'loss': '0.4315', 'grad_norm': '0.1936', 'learning_rate': '0.0001928', 'ppl': '1.54', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617.4', 'tokens/total': 5668864, 'tokens/trainable': 5607200, 'epoch': '1.006'}
 12%|███████████████████████▍                                                                                                                                                                        | 692/5680 [1:52:23<18:16:43, 13.19s/it] 12%|███████████████████████▍                                                                                                                                                                        | 693/5680 [1:52:37<18:29:16, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.475', 'grad_norm': '0.2034', 'learning_rate': '0.0001928', 'ppl': '1.608', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.1', 'tokens/total': 5677056, 'tokens/trainable': 5615365, 'epoch': '1.006'}
 12%|███████████████████████▍                                                                                                                                                                        | 693/5680 [1:52:37<18:29:16, 13.35s/it] 12%|███████████████████████▍                                                                                                                                                                        | 694/5680 [1:52:50<18:41:03, 13.49s/it]                                                                                                                                                                                                                                             {'loss': '0.7461', 'grad_norm': '0.231', 'learning_rate': '0.0001927', 'ppl': '2.109', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588', 'tokens/total': 5685248, 'tokens/trainable': 5623489, 'epoch': '1.006'}
 12%|███████████████████████▍                                                                                                                                                                        | 694/5680 [1:52:50<18:41:03, 13.49s/it] 12%|███████████████████████▍                                                                                                                                                                        | 695/5680 [1:53:04<18:46:54, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.8961', 'grad_norm': '0.243', 'learning_rate': '0.0001927', 'ppl': '2.45', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.8', 'tokens/total': 5693440, 'tokens/trainable': 5631667, 'epoch': '1.006'}
 12%|███████████████████████▍                                                                                                                                                                        | 695/5680 [1:53:04<18:46:54, 13.56s/it] 12%|███████████████████████▌                                                                                                                                                                        | 696/5680 [1:53:17<18:38:32, 13.47s/it]                                                                                                                                                                                                                                             {'loss': '0.7142', 'grad_norm': '0.2134', 'learning_rate': '0.0001927', 'ppl': '2.042', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617.7', 'tokens/total': 5701632, 'tokens/trainable': 5639838, 'epoch': '1.007'}
 12%|███████████████████████▌                                                                                                                                                                        | 696/5680 [1:53:17<18:38:32, 13.47s/it] 12%|███████████████████████▌                                                                                                                                                                        | 697/5680 [1:53:30<18:23:23, 13.29s/it]                                                                                                                                                                                                                                             {'loss': '0.7618', 'grad_norm': '0.2156', 'learning_rate': '0.0001927', 'ppl': '2.142', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '635.1', 'tokens/total': 5709824, 'tokens/trainable': 5648004, 'epoch': '1.007'}
 12%|███████████████████████▌                                                                                                                                                                        | 697/5680 [1:53:30<18:23:23, 13.29s/it] 12%|███████████████████████▌                                                                                                                                                                        | 698/5680 [1:53:43<18:13:25, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.5475', 'grad_norm': '0.1807', 'learning_rate': '0.0001927', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.2', 'tokens/total': 5718016, 'tokens/trainable': 5656178, 'epoch': '1.007'}
 12%|███████████████████████▌                                                                                                                                                                        | 698/5680 [1:53:43<18:13:25, 13.17s/it] 12%|███████████████████████▋                                                                                                                                                                        | 699/5680 [1:53:56<18:13:54, 13.18s/it]                                                                                                                                                                                                                                             {'loss': '0.7974', 'grad_norm': '0.246', 'learning_rate': '0.0001926', 'ppl': '2.22', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617', 'tokens/total': 5726208, 'tokens/trainable': 5664304, 'epoch': '1.007'}
 12%|███████████████████████▋                                                                                                                                                                        | 699/5680 [1:53:56<18:13:54, 13.18s/it] 12%|███████████████████████▋                                                                                                                                                                        | 700/5680 [1:54:10<18:33:51, 13.42s/it]                                                                                                                                                                                                                                             {'loss': '0.6007', 'grad_norm': '0.2153', 'learning_rate': '0.0001926', 'ppl': '1.823', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '582.5', 'tokens/total': 5734400, 'tokens/trainable': 5672443, 'epoch': '1.007'}
 12%|███████████████████████▋                                                                                                                                                                        | 700/5680 [1:54:10<18:33:51, 13.42s/it] 12%|███████████████████████▋                                                                                                                                                                        | 701/5680 [1:54:24<18:46:35, 13.58s/it]                                                                                                                                                                                                                                             {'loss': '0.6426', 'grad_norm': '0.231', 'learning_rate': '0.0001926', 'ppl': '1.901', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.6', 'tokens/total': 5742592, 'tokens/trainable': 5680601, 'epoch': '1.007'}
 12%|███████████████████████▋                                                                                                                                                                        | 701/5680 [1:54:24<18:46:35, 13.58s/it] 12%|███████████████████████▋                                                                                                                                                                        | 702/5680 [1:54:38<18:50:10, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.6694', 'grad_norm': '0.2083', 'learning_rate': '0.0001926', 'ppl': '1.953', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595', 'tokens/total': 5750784, 'tokens/trainable': 5688763, 'epoch': '1.008'}
 12%|███████████████████████▋                                                                                                                                                                        | 702/5680 [1:54:38<18:50:10, 13.62s/it] 12%|███████████████████████▊                                                                                                                                                                        | 703/5680 [1:54:52<18:54:09, 13.67s/it]                                                                                                                                                                                                                                             {'loss': '0.7768', 'grad_norm': '0.2795', 'learning_rate': '0.0001926', 'ppl': '2.175', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.4', 'tokens/total': 5758976, 'tokens/trainable': 5696915, 'epoch': '1.008'}
 12%|███████████████████████▊                                                                                                                                                                        | 703/5680 [1:54:52<18:54:09, 13.67s/it] 12%|███████████████████████▊                                                                                                                                                                        | 704/5680 [1:55:05<18:32:15, 13.41s/it]                                                                                                                                                                                                                                             {'loss': '0.5618', 'grad_norm': '0.1937', 'learning_rate': '0.0001925', 'ppl': '1.754', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '638.3', 'tokens/total': 5767168, 'tokens/trainable': 5705079, 'epoch': '1.008'}
 12%|███████████████████████▊                                                                                                                                                                        | 704/5680 [1:55:05<18:32:15, 13.41s/it] 12%|███████████████████████▊                                                                                                                                                                        | 705/5680 [1:55:18<18:27:47, 13.36s/it]                                                                                                                                                                                                                                             {'loss': '0.9409', 'grad_norm': '0.2436', 'learning_rate': '0.0001925', 'ppl': '2.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617.6', 'tokens/total': 5775360, 'tokens/trainable': 5713248, 'epoch': '1.008'}
 12%|███████████████████████▊                                                                                                                                                                        | 705/5680 [1:55:18<18:27:47, 13.36s/it] 12%|███████████████████████▊                                                                                                                                                                        | 706/5680 [1:55:31<18:19:01, 13.26s/it]                                                                                                                                                                                                                                             {'loss': '1.08', 'grad_norm': '0.2534', 'learning_rate': '0.0001925', 'ppl': '2.944', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.2', 'tokens/total': 5783552, 'tokens/trainable': 5721434, 'epoch': '1.008'}
 12%|███████████████████████▊                                                                                                                                                                        | 706/5680 [1:55:31<18:19:01, 13.26s/it] 12%|███████████████████████▉                                                                                                                                                                        | 707/5680 [1:55:44<18:17:10, 13.24s/it]                                                                                                                                                                                                                                             {'loss': '0.8468', 'grad_norm': '0.2342', 'learning_rate': '0.0001925', 'ppl': '2.332', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '621.3', 'tokens/total': 5791744, 'tokens/trainable': 5729623, 'epoch': '1.008'}
 12%|███████████████████████▉                                                                                                                                                                        | 707/5680 [1:55:44<18:17:10, 13.24s/it] 12%|███████████████████████▉                                                                                                                                                                        | 708/5680 [1:55:58<18:27:50, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.6174', 'grad_norm': '0.2186', 'learning_rate': '0.0001925', 'ppl': '1.854', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599', 'tokens/total': 5799936, 'tokens/trainable': 5737810, 'epoch': '1.009'}
 12%|███████████████████████▉                                                                                                                                                                        | 708/5680 [1:55:58<18:27:50, 13.37s/it] 12%|███████████████████████▉                                                                                                                                                                        | 709/5680 [1:56:12<18:43:49, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.5965', 'grad_norm': '0.2543', 'learning_rate': '0.0001924', 'ppl': '1.816', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '581.7', 'tokens/total': 5808128, 'tokens/trainable': 5745965, 'epoch': '1.009'}
 12%|███████████████████████▉                                                                                                                                                                        | 709/5680 [1:56:12<18:43:49, 13.56s/it] 12%|████████████████████████                                                                                                                                                                        | 710/5680 [1:56:26<18:50:52, 13.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5338', 'grad_norm': '0.2044', 'learning_rate': '0.0001924', 'ppl': '1.705', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '587.5', 'tokens/total': 5816320, 'tokens/trainable': 5754104, 'epoch': '1.009'}
 12%|████████████████████████                                                                                                                                                                        | 710/5680 [1:56:26<18:50:52, 13.65s/it] 13%|████████████████████████                                                                                                                                                                        | 711/5680 [1:56:39<18:48:01, 13.62s/it]                                                                                                                                                                                                                                             {'loss': '0.7323', 'grad_norm': '0.2439', 'learning_rate': '0.0001924', 'ppl': '2.08', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.2', 'tokens/total': 5824512, 'tokens/trainable': 5762200, 'epoch': '1.009'}
 13%|████████████████████████                                                                                                                                                                        | 711/5680 [1:56:39<18:48:01, 13.62s/it] 13%|████████████████████████                                                                                                                                                                        | 712/5680 [1:56:52<18:34:13, 13.46s/it]                                                                                                                                                                                                                                             {'loss': '0.6463', 'grad_norm': '0.2121', 'learning_rate': '0.0001924', 'ppl': '1.909', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625.2', 'tokens/total': 5832704, 'tokens/trainable': 5770369, 'epoch': '1.009'}
 13%|████████████████████████                                                                                                                                                                        | 712/5680 [1:56:52<18:34:13, 13.46s/it] 13%|████████████████████████                                                                                                                                                                        | 713/5680 [1:57:05<18:22:43, 13.32s/it]                                                                                                                                                                                                                                             {'loss': '0.8343', 'grad_norm': '0.2444', 'learning_rate': '0.0001923', 'ppl': '2.303', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '623.4', 'tokens/total': 5840896, 'tokens/trainable': 5778471, 'epoch': '1.01'}
 13%|████████████████████████                                                                                                                                                                        | 713/5680 [1:57:05<18:22:43, 13.32s/it] 13%|████████████████████████▏                                                                                                                                                                       | 714/5680 [1:57:18<18:11:56, 13.19s/it]                                                                                                                                                                                                                                             {'loss': '0.8294', 'grad_norm': '0.2464', 'learning_rate': '0.0001923', 'ppl': '2.292', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.9', 'tokens/total': 5849088, 'tokens/trainable': 5786591, 'epoch': '1.01'}
 13%|████████████████████████▏                                                                                                                                                                       | 714/5680 [1:57:18<18:11:56, 13.19s/it] 13%|████████████████████████▏                                                                                                                                                                       | 715/5680 [1:57:31<18:15:51, 13.24s/it]                                                                                                                                                                                                                                             {'loss': '0.7383', 'grad_norm': '0.2208', 'learning_rate': '0.0001923', 'ppl': '2.092', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '611.1', 'tokens/total': 5857280, 'tokens/trainable': 5794748, 'epoch': '1.01'}
 13%|████████████████████████▏                                                                                                                                                                       | 715/5680 [1:57:31<18:15:51, 13.24s/it] 13%|████████████████████████▏                                                                                                                                                                       | 716/5680 [1:57:45<18:26:59, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.8656', 'grad_norm': '0.2388', 'learning_rate': '0.0001923', 'ppl': '2.376', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.3', 'tokens/total': 5865472, 'tokens/trainable': 5802901, 'epoch': '1.01'}
 13%|████████████████████████▏                                                                                                                                                                       | 716/5680 [1:57:45<18:26:59, 13.38s/it] 13%|████████████████████████▏                                                                                                                                                                       | 717/5680 [1:57:59<18:36:46, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '0.6299', 'grad_norm': '0.238', 'learning_rate': '0.0001923', 'ppl': '1.877', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.5', 'tokens/total': 5873664, 'tokens/trainable': 5811090, 'epoch': '1.01'}
 13%|████████████████████████▏                                                                                                                                                                       | 717/5680 [1:57:59<18:36:46, 13.50s/it] 13%|████████████████████████▎                                                                                                                                                                       | 718/5680 [1:58:12<18:35:04, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.6802', 'grad_norm': '0.2062', 'learning_rate': '0.0001922', 'ppl': '1.974', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.9', 'tokens/total': 5881856, 'tokens/trainable': 5819218, 'epoch': '1.01'}
 13%|████████████████████████▎                                                                                                                                                                       | 718/5680 [1:58:12<18:35:04, 13.48s/it] 13%|████████████████████████▎                                                                                                                                                                       | 719/5680 [1:58:26<18:35:50, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '0.5119', 'grad_norm': '0.2211', 'learning_rate': '0.0001922', 'ppl': '1.669', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '601.3', 'tokens/total': 5890048, 'tokens/trainable': 5827347, 'epoch': '1.011'}
 13%|████████████████████████▎                                                                                                                                                                       | 719/5680 [1:58:26<18:35:50, 13.50s/it] 13%|████████████████████████▎                                                                                                                                                                       | 720/5680 [1:58:39<18:30:52, 13.44s/it]                                                                                                                                                                                                                                             {'loss': '0.7942', 'grad_norm': '0.2231', 'learning_rate': '0.0001922', 'ppl': '2.213', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '613.6', 'tokens/total': 5898240, 'tokens/trainable': 5835509, 'epoch': '1.011'}
 13%|████████████████████████▎                                                                                                                                                                       | 720/5680 [1:58:39<18:30:52, 13.44s/it] 13%|████████████████████████▎                                                                                                                                                                       | 721/5680 [1:58:52<18:21:44, 13.33s/it]                                                                                                                                                                                                                                             {'loss': '0.5465', 'grad_norm': '0.1964', 'learning_rate': '0.0001922', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625', 'tokens/total': 5906432, 'tokens/trainable': 5843681, 'epoch': '1.011'}
 13%|████████████████████████▎                                                                                                                                                                       | 721/5680 [1:58:52<18:21:44, 13.33s/it] 13%|████████████████████████▍                                                                                                                                                                       | 722/5680 [1:59:05<18:11:45, 13.21s/it]                                                                                                                                                                                                                                             {'loss': '0.5205', 'grad_norm': '0.2037', 'learning_rate': '0.0001922', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.7', 'tokens/total': 5914624, 'tokens/trainable': 5851824, 'epoch': '1.011'}
 13%|████████████████████████▍                                                                                                                                                                       | 722/5680 [1:59:05<18:11:45, 13.21s/it] 13%|████████████████████████▍                                                                                                                                                                       | 723/5680 [1:59:18<18:03:32, 13.12s/it]                                                                                                                                                                                                                                             {'loss': '0.6458', 'grad_norm': '0.2042', 'learning_rate': '0.0001921', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636', 'tokens/total': 5922816, 'tokens/trainable': 5860008, 'epoch': '1.011'}
 13%|████████████████████████▍                                                                                                                                                                       | 723/5680 [1:59:18<18:03:32, 13.12s/it] 13%|████████████████████████▍                                                                                                                                                                       | 724/5680 [1:59:32<18:20:52, 13.33s/it]                                                                                                                                                                                                                                             {'loss': '0.7044', 'grad_norm': '0.2253', 'learning_rate': '0.0001921', 'ppl': '2.023', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.3', 'tokens/total': 5931008, 'tokens/trainable': 5868164, 'epoch': '1.011'}
 13%|████████████████████████▍                                                                                                                                                                       | 724/5680 [1:59:32<18:20:52, 13.33s/it] 13%|████████████████████████▌                                                                                                                                                                       | 725/5680 [1:59:46<18:32:21, 13.47s/it]                                                                                                                                                                                                                                             {'loss': '0.7185', 'grad_norm': '0.2457', 'learning_rate': '0.0001921', 'ppl': '2.051', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.7', 'tokens/total': 5939200, 'tokens/trainable': 5876298, 'epoch': '1.012'}
 13%|████████████████████████▌                                                                                                                                                                       | 725/5680 [1:59:46<18:32:21, 13.47s/it] 13%|████████████████████████▌                                                                                                                                                                       | 726/5680 [1:59:59<18:35:32, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.8907', 'grad_norm': '0.2671', 'learning_rate': '0.0001921', 'ppl': '2.437', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.7', 'tokens/total': 5947392, 'tokens/trainable': 5884467, 'epoch': '1.012'}
 13%|████████████████████████▌                                                                                                                                                                       | 726/5680 [1:59:59<18:35:32, 13.51s/it] 13%|████████████████████████▌                                                                                                                                                                       | 727/5680 [2:00:13<18:40:18, 13.57s/it]                                                                                                                                                                                                                                             {'loss': '1.019', 'grad_norm': '0.2704', 'learning_rate': '0.000192', 'ppl': '2.77', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595', 'tokens/total': 5955584, 'tokens/trainable': 5892621, 'epoch': '1.012'}
 13%|████████████████████████▌                                                                                                                                                                       | 727/5680 [2:00:13<18:40:18, 13.57s/it] 13%|████████████████████████▌                                                                                                                                                                       | 728/5680 [2:00:26<18:22:49, 13.36s/it]                                                                                                                                                                                                                                             {'loss': '0.7282', 'grad_norm': '0.2111', 'learning_rate': '0.000192', 'ppl': '2.071', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.4', 'tokens/total': 5963776, 'tokens/trainable': 5900757, 'epoch': '1.012'}
 13%|████████████████████████▌                                                                                                                                                                       | 728/5680 [2:00:26<18:22:49, 13.36s/it] 13%|████████████████████████▋                                                                                                                                                                       | 729/5680 [2:00:39<18:10:36, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.49', 'grad_norm': '0.1871', 'learning_rate': '0.000192', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.5', 'tokens/total': 5971968, 'tokens/trainable': 5908948, 'epoch': '1.012'}
 13%|████████████████████████▋                                                                                                                                                                       | 729/5680 [2:00:39<18:10:36, 13.22s/it] 13%|████████████████████████▋                                                                                                                                                                       | 730/5680 [2:00:52<18:02:21, 13.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8423', 'grad_norm': '0.2223', 'learning_rate': '0.000192', 'ppl': '2.322', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '633.3', 'tokens/total': 5980160, 'tokens/trainable': 5917105, 'epoch': '1.012'}
 13%|████████████████████████▋                                                                                                                                                                       | 730/5680 [2:00:52<18:02:21, 13.12s/it] 13%|████████████████████████▋                                                                                                                                                                       | 731/5680 [2:01:05<18:00:49, 13.10s/it]                                                                                                                                                                                                                                             {'loss': '0.7635', 'grad_norm': '0.2338', 'learning_rate': '0.000192', 'ppl': '2.146', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.1', 'tokens/total': 5988352, 'tokens/trainable': 5925226, 'epoch': '1.013'}
 13%|████████████████████████▋                                                                                                                                                                       | 731/5680 [2:01:05<18:00:49, 13.10s/it] 13%|████████████████████████▋                                                                                                                                                                       | 732/5680 [2:01:18<18:16:21, 13.29s/it]                                                                                                                                                                                                                                             {'loss': '0.6316', 'grad_norm': '0.2133', 'learning_rate': '0.0001919', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.4', 'tokens/total': 5996544, 'tokens/trainable': 5933392, 'epoch': '1.013'}
 13%|████████████████████████▋                                                                                                                                                                       | 732/5680 [2:01:18<18:16:21, 13.29s/it] 13%|████████████████████████▊                                                                                                                                                                       | 733/5680 [2:01:32<18:27:49, 13.44s/it]                                                                                                                                                                                                                                             {'loss': '0.8285', 'grad_norm': '0.2272', 'learning_rate': '0.0001919', 'ppl': '2.29', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.6', 'tokens/total': 6004736, 'tokens/trainable': 5941506, 'epoch': '1.013'}
 13%|████████████████████████▊                                                                                                                                                                       | 733/5680 [2:01:32<18:27:49, 13.44s/it] 13%|████████████████████████▊                                                                                                                                                                       | 734/5680 [2:01:46<18:33:06, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '0.8109', 'grad_norm': '0.2514', 'learning_rate': '0.0001919', 'ppl': '2.25', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '597.4', 'tokens/total': 6012928, 'tokens/trainable': 5949662, 'epoch': '1.013'}
 13%|████████████████████████▊                                                                                                                                                                       | 734/5680 [2:01:46<18:33:06, 13.50s/it] 13%|████████████████████████▊                                                                                                                                                                       | 735/5680 [2:02:00<18:36:04, 13.54s/it]                                                                                                                                                                                                                                             {'loss': '0.8724', 'grad_norm': '0.247', 'learning_rate': '0.0001919', 'ppl': '2.393', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595', 'tokens/total': 6021120, 'tokens/trainable': 5957770, 'epoch': '1.013'}
 13%|████████████████████████▊                                                                                                                                                                       | 735/5680 [2:02:00<18:36:04, 13.54s/it] 13%|████████████████████████▉                                                                                                                                                                       | 736/5680 [2:02:13<18:27:19, 13.44s/it]                                                                                                                                                                                                                                             {'loss': '0.8027', 'grad_norm': '0.2489', 'learning_rate': '0.0001918', 'ppl': '2.232', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '618.9', 'tokens/total': 6029312, 'tokens/trainable': 5965933, 'epoch': '1.014'}
 13%|████████████████████████▉                                                                                                                                                                       | 736/5680 [2:02:13<18:27:19, 13.44s/it] 13%|████████████████████████▉                                                                                                                                                                       | 737/5680 [2:02:26<18:15:48, 13.30s/it]                                                                                                                                                                                                                                             {'loss': '0.7704', 'grad_norm': '0.2651', 'learning_rate': '0.0001918', 'ppl': '2.161', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.2', 'tokens/total': 6037504, 'tokens/trainable': 5974120, 'epoch': '1.014'}
 13%|████████████████████████▉                                                                                                                                                                       | 737/5680 [2:02:26<18:15:48, 13.30s/it] 13%|████████████████████████▉                                                                                                                                                                       | 738/5680 [2:02:39<18:03:57, 13.16s/it]                                                                                                                                                                                                                                             {'loss': '0.8827', 'grad_norm': '0.2909', 'learning_rate': '0.0001918', 'ppl': '2.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.3', 'tokens/total': 6045696, 'tokens/trainable': 5982269, 'epoch': '1.014'}
 13%|████████████████████████▉                                                                                                                                                                       | 738/5680 [2:02:39<18:03:57, 13.16s/it] 13%|████████████████████████▉                                                                                                                                                                       | 739/5680 [2:02:51<17:57:33, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.7513', 'grad_norm': '0.2344', 'learning_rate': '0.0001918', 'ppl': '2.12', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.6', 'tokens/total': 6053888, 'tokens/trainable': 5990416, 'epoch': '1.014'}
 13%|████████████████████████▉                                                                                                                                                                       | 739/5680 [2:02:51<17:57:33, 13.09s/it] 13%|█████████████████████████                                                                                                                                                                       | 740/5680 [2:03:05<18:11:57, 13.26s/it]                                                                                                                                                                                                                                             {'loss': '0.8806', 'grad_norm': '0.2406', 'learning_rate': '0.0001918', 'ppl': '2.412', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.4', 'tokens/total': 6062080, 'tokens/trainable': 5998567, 'epoch': '1.014'}
 13%|█████████████████████████                                                                                                                                                                       | 740/5680 [2:03:05<18:11:57, 13.26s/it] 13%|█████████████████████████                                                                                                                                                                       | 741/5680 [2:03:19<18:25:23, 13.43s/it]                                                                                                                                                                                                                                             {'loss': '0.715', 'grad_norm': '0.2475', 'learning_rate': '0.0001917', 'ppl': '2.044', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.7', 'tokens/total': 6070272, 'tokens/trainable': 6006711, 'epoch': '1.014'}
 13%|█████████████████████████                                                                                                                                                                       | 741/5680 [2:03:19<18:25:23, 13.43s/it] 13%|█████████████████████████                                                                                                                                                                       | 742/5680 [2:03:33<18:30:49, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '0.6506', 'grad_norm': '0.2198', 'learning_rate': '0.0001917', 'ppl': '1.917', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.3', 'tokens/total': 6078464, 'tokens/trainable': 6014836, 'epoch': '1.015'}
 13%|█████████████████████████                                                                                                                                                                       | 742/5680 [2:03:33<18:30:49, 13.50s/it] 13%|█████████████████████████                                                                                                                                                                       | 743/5680 [2:03:46<18:38:14, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.9004', 'grad_norm': '0.2194', 'learning_rate': '0.0001917', 'ppl': '2.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.3', 'tokens/total': 6086656, 'tokens/trainable': 6022997, 'epoch': '1.015'}
 13%|█████████████████████████                                                                                                                                                                       | 743/5680 [2:03:46<18:38:14, 13.59s/it] 13%|█████████████████████████▏                                                                                                                                                                      | 744/5680 [2:04:00<18:26:38, 13.45s/it]                                                                                                                                                                                                                                             {'loss': '0.4546', 'grad_norm': '0.1973', 'learning_rate': '0.0001917', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '621.3', 'tokens/total': 6094848, 'tokens/trainable': 6031148, 'epoch': '1.015'}
 13%|█████████████████████████▏                                                                                                                                                                      | 744/5680 [2:04:00<18:26:38, 13.45s/it] 13%|█████████████████████████▏                                                                                                                                                                      | 745/5680 [2:04:13<18:15:56, 13.32s/it]                                                                                                                                                                                                                                             {'loss': '0.6126', 'grad_norm': '0.2098', 'learning_rate': '0.0001917', 'ppl': '1.845', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '626.2', 'tokens/total': 6103040, 'tokens/trainable': 6039299, 'epoch': '1.015'}
 13%|█████████████████████████▏                                                                                                                                                                      | 745/5680 [2:04:13<18:15:56, 13.32s/it] 13%|█████████████████████████▏                                                                                                                                                                      | 746/5680 [2:04:26<18:09:29, 13.25s/it]                                                                                                                                                                                                                                             {'loss': '0.6973', 'grad_norm': '0.2065', 'learning_rate': '0.0001916', 'ppl': '2.008', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.9', 'tokens/total': 6111232, 'tokens/trainable': 6047440, 'epoch': '1.015'}
 13%|█████████████████████████▏                                                                                                                                                                      | 746/5680 [2:04:26<18:09:29, 13.25s/it] 13%|█████████████████████████▎                                                                                                                                                                      | 747/5680 [2:04:39<18:00:01, 13.14s/it]                                                                                                                                                                                                                                             {'loss': '0.5698', 'grad_norm': '0.1987', 'learning_rate': '0.0001916', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.4', 'tokens/total': 6119424, 'tokens/trainable': 6055549, 'epoch': '1.015'}
 13%|█████████████████████████▎                                                                                                                                                                      | 747/5680 [2:04:39<18:00:01, 13.14s/it] 13%|█████████████████████████▎                                                                                                                                                                      | 748/5680 [2:04:52<18:09:01, 13.25s/it]                                                                                                                                                                                                                                             {'loss': '0.6698', 'grad_norm': '0.2359', 'learning_rate': '0.0001916', 'ppl': '1.954', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.6', 'tokens/total': 6127616, 'tokens/trainable': 6063687, 'epoch': '1.016'}
 13%|█████████████████████████▎                                                                                                                                                                      | 748/5680 [2:04:52<18:09:01, 13.25s/it] 13%|█████████████████████████▎                                                                                                                                                                      | 749/5680 [2:05:06<18:17:30, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.7221', 'grad_norm': '0.2319', 'learning_rate': '0.0001916', 'ppl': '2.059', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '597.5', 'tokens/total': 6135808, 'tokens/trainable': 6071811, 'epoch': '1.016'}
 13%|█████████████████████████▎                                                                                                                                                                      | 749/5680 [2:05:06<18:17:30, 13.35s/it] 13%|█████████████████████████▎                                                                                                                                                                      | 750/5680 [2:05:19<18:28:11, 13.49s/it]                                                                                                                                                                                                                                             {'loss': '0.766', 'grad_norm': '0.2268', 'learning_rate': '0.0001915', 'ppl': '2.151', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.4', 'tokens/total': 6144000, 'tokens/trainable': 6079967, 'epoch': '1.016'}
 13%|█████████████████████████▎                                                                                                                                                                      | 750/5680 [2:05:19<18:28:11, 13.49s/it] 13%|█████████████████████████▍                                                                                                                                                                      | 751/5680 [2:05:33<18:33:33, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.8227', 'grad_norm': '0.3', 'learning_rate': '0.0001915', 'ppl': '2.277', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593.4', 'tokens/total': 6152192, 'tokens/trainable': 6088102, 'epoch': '1.016'}
 13%|█████████████████████████▍                                                                                                                                                                      | 751/5680 [2:05:33<18:33:33, 13.56s/it] 13%|█████████████████████████▍                                                                                                                                                                      | 752/5680 [2:05:47<18:28:57, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '0.7948', 'grad_norm': '0.2765', 'learning_rate': '0.0001915', 'ppl': '2.214', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '607.8', 'tokens/total': 6160384, 'tokens/trainable': 6096224, 'epoch': '1.016'}
 13%|█████████████████████████▍                                                                                                                                                                      | 752/5680 [2:05:47<18:28:57, 13.50s/it] 13%|█████████████████████████▍                                                                                                                                                                      | 753/5680 [2:06:00<18:17:35, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.7139', 'grad_norm': '0.2152', 'learning_rate': '0.0001915', 'ppl': '2.042', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625.3', 'tokens/total': 6168576, 'tokens/trainable': 6104382, 'epoch': '1.017'}
 13%|█████████████████████████▍                                                                                                                                                                      | 753/5680 [2:06:00<18:17:35, 13.37s/it] 13%|█████████████████████████▍                                                                                                                                                                      | 754/5680 [2:06:13<18:10:39, 13.28s/it]                                                                                                                                                                                                                                             {'loss': '0.8085', 'grad_norm': '0.2448', 'learning_rate': '0.0001915', 'ppl': '2.245', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '620.9', 'tokens/total': 6176768, 'tokens/trainable': 6112505, 'epoch': '1.017'}
 13%|█████████████████████████▍                                                                                                                                                                      | 754/5680 [2:06:13<18:10:39, 13.28s/it] 13%|█████████████████████████▌                                                                                                                                                                      | 755/5680 [2:06:26<17:59:50, 13.16s/it]                                                                                                                                                                                                                                             {'loss': '0.6547', 'grad_norm': '0.2204', 'learning_rate': '0.0001914', 'ppl': '1.925', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '635.8', 'tokens/total': 6184960, 'tokens/trainable': 6120671, 'epoch': '1.017'}
 13%|█████████████████████████▌                                                                                                                                                                      | 755/5680 [2:06:26<17:59:50, 13.16s/it] 13%|█████████████████████████▌                                                                                                                                                                      | 756/5680 [2:06:39<18:04:47, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.445', 'grad_norm': '0.1969', 'learning_rate': '0.0001914', 'ppl': '1.561', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '613.4', 'tokens/total': 6193152, 'tokens/trainable': 6128859, 'epoch': '1.017'}
 13%|█████████████████████████▌                                                                                                                                                                      | 756/5680 [2:06:39<18:04:47, 13.22s/it] 13%|█████████████████████████▌                                                                                                                                                                      | 757/5680 [2:06:53<18:19:23, 13.40s/it]                                                                                                                                                                                                                                             {'loss': '0.856', 'grad_norm': '0.2549', 'learning_rate': '0.0001914', 'ppl': '2.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.2', 'tokens/total': 6201344, 'tokens/trainable': 6137042, 'epoch': '1.017'}
 13%|█████████████████████████▌                                                                                                                                                                      | 757/5680 [2:06:53<18:19:23, 13.40s/it] 13%|█████████████████████████▌                                                                                                                                                                      | 758/5680 [2:07:06<18:27:37, 13.50s/it]                                                                                                                                                                                                                                             {'loss': '1.004', 'grad_norm': '0.266', 'learning_rate': '0.0001914', 'ppl': '2.729', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '595.9', 'tokens/total': 6209536, 'tokens/trainable': 6145229, 'epoch': '1.017'}
 13%|█████████████████████████▌                                                                                                                                                                      | 758/5680 [2:07:06<18:27:37, 13.50s/it] 13%|█████████████████████████▋                                                                                                                                                                      | 759/5680 [2:07:20<18:26:36, 13.49s/it]                                                                                                                                                                                                                                             {'loss': '0.7066', 'grad_norm': '0.2269', 'learning_rate': '0.0001913', 'ppl': '2.027', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606', 'tokens/total': 6217728, 'tokens/trainable': 6153388, 'epoch': '1.018'}
 13%|█████████████████████████▋                                                                                                                                                                      | 759/5680 [2:07:20<18:26:36, 13.49s/it] 13%|█████████████████████████▋                                                                                                                                                                      | 760/5680 [2:07:34<18:28:50, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.6034', 'grad_norm': '0.1967', 'learning_rate': '0.0001913', 'ppl': '1.828', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '602.4', 'tokens/total': 6225920, 'tokens/trainable': 6161573, 'epoch': '1.018'}
 13%|█████████████████████████▋                                                                                                                                                                      | 760/5680 [2:07:34<18:28:50, 13.52s/it] 13%|█████████████████████████▋                                                                                                                                                                      | 761/5680 [2:07:47<18:16:56, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.8696', 'grad_norm': '0.2446', 'learning_rate': '0.0001913', 'ppl': '2.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '627.5', 'tokens/total': 6234112, 'tokens/trainable': 6169753, 'epoch': '1.018'}
 13%|█████████████████████████▋                                                                                                                                                                      | 761/5680 [2:07:47<18:16:56, 13.38s/it] 13%|█████████████████████████▊                                                                                                                                                                      | 762/5680 [2:07:59<18:04:09, 13.23s/it]                                                                                                                                                                                                                                             {'loss': '0.5942', 'grad_norm': '0.2247', 'learning_rate': '0.0001913', 'ppl': '1.812', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.5', 'tokens/total': 6242304, 'tokens/trainable': 6177847, 'epoch': '1.018'}
 13%|█████████████████████████▊                                                                                                                                                                      | 762/5680 [2:07:59<18:04:09, 13.23s/it] 13%|█████████████████████████▊                                                                                                                                                                      | 763/5680 [2:08:12<17:56:11, 13.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6427', 'grad_norm': '0.2157', 'learning_rate': '0.0001912', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632', 'tokens/total': 6250496, 'tokens/trainable': 6186001, 'epoch': '1.018'}
 13%|█████████████████████████▊                                                                                                                                                                      | 763/5680 [2:08:12<17:56:11, 13.13s/it] 13%|█████████████████████████▊                                                                                                                                                                      | 764/5680 [2:08:26<17:59:16, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7986', 'grad_norm': '0.229', 'learning_rate': '0.0001912', 'ppl': '2.222', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '616.8', 'tokens/total': 6258688, 'tokens/trainable': 6194177, 'epoch': '1.018'}
 13%|█████████████████████████▊                                                                                                                                                                      | 764/5680 [2:08:26<17:59:16, 13.17s/it] 13%|█████████████████████████▊                                                                                                                                                                      | 765/5680 [2:08:39<18:15:11, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.6852', 'grad_norm': '0.218', 'learning_rate': '0.0001912', 'ppl': '1.984', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.5', 'tokens/total': 6266880, 'tokens/trainable': 6202270, 'epoch': '1.019'}
 13%|█████████████████████████▊                                                                                                                                                                      | 765/5680 [2:08:39<18:15:11, 13.37s/it] 13%|█████████████████████████▉                                                                                                                                                                      | 766/5680 [2:08:53<18:26:38, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.6206', 'grad_norm': '0.222', 'learning_rate': '0.0001912', 'ppl': '1.86', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.5', 'tokens/total': 6275072, 'tokens/trainable': 6210428, 'epoch': '1.019'}
 13%|█████████████████████████▉                                                                                                                                                                      | 766/5680 [2:08:53<18:26:38, 13.51s/it] 14%|█████████████████████████▉                                                                                                                                                                      | 767/5680 [2:09:07<18:33:57, 13.60s/it]                                                                                                                                                                                                                                             {'loss': '0.695', 'grad_norm': '0.2361', 'learning_rate': '0.0001912', 'ppl': '2.004', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.5', 'tokens/total': 6283264, 'tokens/trainable': 6218583, 'epoch': '1.019'}
 14%|█████████████████████████▉                                                                                                                                                                      | 767/5680 [2:09:07<18:33:57, 13.60s/it] 14%|█████████████████████████▉                                                                                                                                                                      | 768/5680 [2:09:21<18:34:11, 13.61s/it]                                                                                                                                                                                                                                             {'loss': '0.5922', 'grad_norm': '0.1979', 'learning_rate': '0.0001911', 'ppl': '1.808', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.7', 'tokens/total': 6291456, 'tokens/trainable': 6226679, 'epoch': '1.019'}
 14%|█████████████████████████▉                                                                                                                                                                      | 768/5680 [2:09:21<18:34:11, 13.61s/it] 14%|█████████████████████████▉                                                                                                                                                                      | 769/5680 [2:09:34<18:18:00, 13.41s/it]                                                                                                                                                                                                                                             {'loss': '0.6352', 'grad_norm': '0.253', 'learning_rate': '0.0001911', 'ppl': '1.887', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '627.6', 'tokens/total': 6299648, 'tokens/trainable': 6234803, 'epoch': '1.019'}
 14%|█████████████████████████▉                                                                                                                                                                      | 769/5680 [2:09:34<18:18:00, 13.41s/it] 14%|██████████████████████████                                                                                                                                                                      | 770/5680 [2:09:47<18:10:27, 13.33s/it]                                                                                                                                                                                                                                             {'loss': '0.6871', 'grad_norm': '0.2388', 'learning_rate': '0.0001911', 'ppl': '1.988', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.1', 'tokens/total': 6307840, 'tokens/trainable': 6242957, 'epoch': '1.02'}
 14%|██████████████████████████                                                                                                                                                                      | 770/5680 [2:09:47<18:10:27, 13.33s/it] 14%|██████████████████████████                                                                                                                                                                      | 771/5680 [2:10:00<18:00:52, 13.21s/it]                                                                                                                                                                                                                                             {'loss': '0.809', 'grad_norm': '0.2491', 'learning_rate': '0.0001911', 'ppl': '2.246', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.1', 'tokens/total': 6316032, 'tokens/trainable': 6251112, 'epoch': '1.02'}
 14%|██████████████████████████                                                                                                                                                                      | 771/5680 [2:10:00<18:00:52, 13.21s/it] 14%|██████████████████████████                                                                                                                                                                      | 772/5680 [2:10:13<17:57:25, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '1.054', 'grad_norm': '0.2656', 'learning_rate': '0.000191', 'ppl': '2.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617.8', 'tokens/total': 6324224, 'tokens/trainable': 6259191, 'epoch': '1.02'}
 14%|██████████████████████████                                                                                                                                                                      | 772/5680 [2:10:13<17:57:25, 13.17s/it] 14%|██████████████████████████▏                                                                                                                                                                     | 773/5680 [2:10:26<18:07:20, 13.30s/it]                                                                                                                                                                                                                                             {'loss': '0.569', 'grad_norm': '0.2139', 'learning_rate': '0.000191', 'ppl': '1.766', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600', 'tokens/total': 6332416, 'tokens/trainable': 6267341, 'epoch': '1.02'}
 14%|██████████████████████████▏                                                                                                                                                                     | 773/5680 [2:10:26<18:07:20, 13.30s/it] 14%|██████████████████████████▏                                                                                                                                                                     | 774/5680 [2:10:40<18:19:42, 13.45s/it]                                                                                                                                                                                                                                             {'loss': '0.7013', 'grad_norm': '0.231', 'learning_rate': '0.000191', 'ppl': '2.016', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.4', 'tokens/total': 6340608, 'tokens/trainable': 6275474, 'epoch': '1.02'}
 14%|██████████████████████████▏                                                                                                                                                                     | 774/5680 [2:10:40<18:19:42, 13.45s/it] 14%|██████████████████████████▏                                                                                                                                                                     | 775/5680 [2:10:54<18:32:09, 13.60s/it]                                                                                                                                                                                                                                             {'loss': '0.8921', 'grad_norm': '0.2338', 'learning_rate': '0.000191', 'ppl': '2.44', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.2', 'tokens/total': 6348800, 'tokens/trainable': 6283641, 'epoch': '1.02'}
 14%|██████████████████████████▏                                                                                                                                                                     | 775/5680 [2:10:54<18:32:09, 13.60s/it] 14%|██████████████████████████▏                                                                                                                                                                     | 776/5680 [2:11:08<18:27:29, 13.55s/it]                                                                                                                                                                                                                                             {'loss': '1.057', 'grad_norm': '0.2565', 'learning_rate': '0.000191', 'ppl': '2.876', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606.5', 'tokens/total': 6356992, 'tokens/trainable': 6291779, 'epoch': '1.021'}
 14%|██████████████████████████▏                                                                                                                                                                     | 776/5680 [2:11:08<18:27:29, 13.55s/it] 14%|██████████████████████████▎                                                                                                                                                                     | 777/5680 [2:11:21<18:12:02, 13.36s/it]                                                                                                                                                                                                                                             {'loss': '0.4662', 'grad_norm': '0.1912', 'learning_rate': '0.0001909', 'ppl': '1.594', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.9', 'tokens/total': 6365184, 'tokens/trainable': 6299942, 'epoch': '1.021'}
 14%|██████████████████████████▎                                                                                                                                                                     | 777/5680 [2:11:21<18:12:02, 13.36s/it] 14%|██████████████████████████▎                                                                                                                                                                     | 778/5680 [2:11:33<17:59:35, 13.21s/it]                                                                                                                                                                                                                                             {'loss': '0.6505', 'grad_norm': '0.2542', 'learning_rate': '0.0001909', 'ppl': '1.916', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.8', 'tokens/total': 6373376, 'tokens/trainable': 6308080, 'epoch': '1.021'}
 14%|██████████████████████████▎                                                                                                                                                                     | 778/5680 [2:11:33<17:59:35, 13.21s/it] 14%|██████████████████████████▎                                                                                                                                                                     | 779/5680 [2:11:46<17:49:55, 13.10s/it]                                                                                                                                                                                                                                             {'loss': '0.7398', 'grad_norm': '0.2561', 'learning_rate': '0.0001909', 'ppl': '2.096', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.5', 'tokens/total': 6381568, 'tokens/trainable': 6316175, 'epoch': '1.021'}
 14%|██████████████████████████▎                                                                                                                                                                     | 779/5680 [2:11:46<17:49:55, 13.10s/it] 14%|██████████████████████████▎                                                                                                                                                                     | 780/5680 [2:11:59<17:53:19, 13.14s/it]                                                                                                                                                                                                                                             {'loss': '0.643', 'grad_norm': '0.2355', 'learning_rate': '0.0001909', 'ppl': '1.902', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '619.3', 'tokens/total': 6389760, 'tokens/trainable': 6324360, 'epoch': '1.021'}
 14%|██████████████████████████▎                                                                                                                                                                     | 780/5680 [2:11:59<17:53:19, 13.14s/it] 14%|██████████████████████████▍                                                                                                                                                                     | 781/5680 [2:12:13<18:09:39, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.6069', 'grad_norm': '0.1872', 'learning_rate': '0.0001908', 'ppl': '1.835', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593', 'tokens/total': 6397952, 'tokens/trainable': 6332550, 'epoch': '1.021'}
 14%|██████████████████████████▍                                                                                                                                                                     | 781/5680 [2:12:13<18:09:39, 13.35s/it] 14%|██████████████████████████▍                                                                                                                                                                     | 782/5680 [2:12:27<18:19:08, 13.46s/it]                                                                                                                                                                                                                                             {'loss': '0.6353', 'grad_norm': '0.2147', 'learning_rate': '0.0001908', 'ppl': '1.888', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.7', 'tokens/total': 6406144, 'tokens/trainable': 6340690, 'epoch': '1.022'}
 14%|██████████████████████████▍                                                                                                                                                                     | 782/5680 [2:12:27<18:19:08, 13.46s/it] 14%|██████████████████████████▍                                                                                                                                                                     | 783/5680 [2:12:41<18:26:28, 13.56s/it]                                                                                                                                                                                                                                             {'loss': '0.5962', 'grad_norm': '0.197', 'learning_rate': '0.0001908', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.3', 'tokens/total': 6414336, 'tokens/trainable': 6348843, 'epoch': '1.022'}
 14%|██████████████████████████▍                                                                                                                                                                     | 783/5680 [2:12:41<18:26:28, 13.56s/it] 14%|██████████████████████████▌                                                                                                                                                                     | 784/5680 [2:12:54<18:23:29, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.8185', 'grad_norm': '0.2244', 'learning_rate': '0.0001908', 'ppl': '2.267', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606.8', 'tokens/total': 6422528, 'tokens/trainable': 6356995, 'epoch': '1.022'}
 14%|██████████████████████████▌                                                                                                                                                                     | 784/5680 [2:12:54<18:23:29, 13.52s/it] 14%|██████████████████████████▌                                                                                                                                                                     | 785/5680 [2:13:07<18:11:18, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.7863', 'grad_norm': '0.2392', 'learning_rate': '0.0001907', 'ppl': '2.195', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '625.7', 'tokens/total': 6430720, 'tokens/trainable': 6365144, 'epoch': '1.022'}
 14%|██████████████████████████▌                                                                                                                                                                     | 785/5680 [2:13:07<18:11:18, 13.38s/it] 14%|██████████████████████████▌                                                                                                                                                                     | 786/5680 [2:13:20<18:00:29, 13.25s/it]                                                                                                                                                                                                                                             {'loss': '0.6747', 'grad_norm': '0.2297', 'learning_rate': '0.0001907', 'ppl': '1.963', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '628.7', 'tokens/total': 6438912, 'tokens/trainable': 6373276, 'epoch': '1.022'}
 14%|██████████████████████████▌                                                                                                                                                                     | 786/5680 [2:13:20<18:00:29, 13.25s/it] 14%|██████████████████████████▌                                                                                                                                                                     | 787/5680 [2:13:33<17:50:54, 13.13s/it]                                                                                                                                                                                                                                             {'loss': '0.6552', 'grad_norm': '0.2231', 'learning_rate': '0.0001907', 'ppl': '1.925', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '637.1', 'tokens/total': 6447104, 'tokens/trainable': 6381465, 'epoch': '1.023'}
 14%|██████████████████████████▌                                                                                                                                                                     | 787/5680 [2:13:33<17:50:54, 13.13s/it] 14%|██████████████████████████▋                                                                                                                                                                     | 788/5680 [2:13:46<17:52:18, 13.15s/it]                                                                                                                                                                                                                                             {'loss': '0.8497', 'grad_norm': '0.2349', 'learning_rate': '0.0001907', 'ppl': '2.339', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '618.8', 'tokens/total': 6455296, 'tokens/trainable': 6389626, 'epoch': '1.023'}
 14%|██████████████████████████▋                                                                                                                                                                     | 788/5680 [2:13:46<17:52:18, 13.15s/it] 14%|██████████████████████████▋                                                                                                                                                                     | 789/5680 [2:14:00<18:07:16, 13.34s/it]                                                                                                                                                                                                                                             {'loss': '0.7784', 'grad_norm': '0.2254', 'learning_rate': '0.0001907', 'ppl': '2.178', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.2', 'tokens/total': 6463488, 'tokens/trainable': 6397753, 'epoch': '1.023'}
 14%|██████████████████████████▋                                                                                                                                                                     | 789/5680 [2:14:00<18:07:16, 13.34s/it] 14%|██████████████████████████▋                                                                                                                                                                     | 790/5680 [2:14:14<18:21:35, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.9861', 'grad_norm': '0.2621', 'learning_rate': '0.0001906', 'ppl': '2.681', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '583.9', 'tokens/total': 6471680, 'tokens/trainable': 6405884, 'epoch': '1.023'}
 14%|██████████████████████████▋                                                                                                                                                                     | 790/5680 [2:14:14<18:21:35, 13.52s/it] 14%|██████████████████████████▋                                                                                                                                                                     | 791/5680 [2:14:28<18:27:09, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6223', 'grad_norm': '0.2378', 'learning_rate': '0.0001906', 'ppl': '1.863', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.8', 'tokens/total': 6479872, 'tokens/trainable': 6414031, 'epoch': '1.023'}
 14%|██████████████████████████▋                                                                                                                                                                     | 791/5680 [2:14:28<18:27:09, 13.59s/it] 14%|██████████████████████████▊                                                                                                                                                                     | 792/5680 [2:14:42<18:31:02, 13.64s/it]                                                                                                                                                                                                                                             {'loss': '0.8051', 'grad_norm': '0.242', 'learning_rate': '0.0001906', 'ppl': '2.237', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.4', 'tokens/total': 6488064, 'tokens/trainable': 6422136, 'epoch': '1.023'}
 14%|██████████████████████████▊                                                                                                                                                                     | 792/5680 [2:14:42<18:31:02, 13.64s/it] 14%|██████████████████████████▊                                                                                                                                                                     | 793/5680 [2:14:54<18:14:32, 13.44s/it]                                                                                                                                                                                                                                             {'loss': '1.015', 'grad_norm': '0.3089', 'learning_rate': '0.0001906', 'ppl': '2.76', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '628.1', 'tokens/total': 6496256, 'tokens/trainable': 6430277, 'epoch': '1.024'}
 14%|██████████████████████████▊                                                                                                                                                                     | 793/5680 [2:14:55<18:14:32, 13.44s/it] 14%|██████████████████████████▊                                                                                                                                                                     | 794/5680 [2:15:07<18:01:02, 13.28s/it]                                                                                                                                                                                                                                             {'loss': '0.7566', 'grad_norm': '0.2408', 'learning_rate': '0.0001905', 'ppl': '2.131', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.6', 'tokens/total': 6504448, 'tokens/trainable': 6438411, 'epoch': '1.024'}
 14%|██████████████████████████▊                                                                                                                                                                     | 794/5680 [2:15:07<18:01:02, 13.28s/it] 14%|██████████████████████████▊                                                                                                                                                                     | 795/5680 [2:15:20<17:52:43, 13.18s/it]                                                                                                                                                                                                                                             {'loss': '1.03', 'grad_norm': '0.2628', 'learning_rate': '0.0001905', 'ppl': '2.8', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.3', 'tokens/total': 6512640, 'tokens/trainable': 6446586, 'epoch': '1.024'}
 14%|██████████████████████████▊                                                                                                                                                                     | 795/5680 [2:15:20<17:52:43, 13.18s/it] 14%|██████████████████████████▉                                                                                                                                                                     | 796/5680 [2:15:33<17:51:44, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7353', 'grad_norm': '0.2183', 'learning_rate': '0.0001905', 'ppl': '2.086', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '619.1', 'tokens/total': 6520832, 'tokens/trainable': 6454717, 'epoch': '1.024'}
 14%|██████████████████████████▉                                                                                                                                                                     | 796/5680 [2:15:33<17:51:44, 13.17s/it] 14%|██████████████████████████▉                                                                                                                                                                     | 797/5680 [2:15:47<18:06:21, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.6777', 'grad_norm': '0.2227', 'learning_rate': '0.0001905', 'ppl': '1.969', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.7', 'tokens/total': 6529024, 'tokens/trainable': 6462850, 'epoch': '1.024'}
 14%|██████████████████████████▉                                                                                                                                                                     | 797/5680 [2:15:47<18:06:21, 13.35s/it] 14%|██████████████████████████▉                                                                                                                                                                     | 798/5680 [2:16:01<18:12:07, 13.42s/it]                                                                                                                                                                                                                                             {'loss': '0.5809', 'grad_norm': '0.2029', 'learning_rate': '0.0001904', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '601.5', 'tokens/total': 6537216, 'tokens/trainable': 6471026, 'epoch': '1.024'}
 14%|██████████████████████████▉                                                                                                                                                                     | 798/5680 [2:16:01<18:12:07, 13.42s/it] 14%|███████████████████████████                                                                                                                                                                     | 799/5680 [2:16:14<18:16:19, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.4566', 'grad_norm': '0.176', 'learning_rate': '0.0001904', 'ppl': '1.579', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.9', 'tokens/total': 6545408, 'tokens/trainable': 6479185, 'epoch': '1.025'}
 14%|███████████████████████████                                                                                                                                                                     | 799/5680 [2:16:14<18:16:19, 13.48s/it] 14%|███████████████████████████                                                                                                                                                                     | 800/5680 [2:16:28<18:14:34, 13.46s/it]                                                                                                                                                                                                                                             {'loss': '0.7983', 'grad_norm': '0.2419', 'learning_rate': '0.0001904', 'ppl': '2.222', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.9', 'tokens/total': 6553600, 'tokens/trainable': 6487298, 'epoch': '1.025'}
 14%|███████████████████████████                                                                                                                                                                     | 800/5680 [2:16:28<18:14:34, 13.46s/it] 14%|███████████████████████████                                                                                                                                                                     | 801/5680 [2:16:41<18:00:45, 13.29s/it]                                                                                                                                                                                                                                             {'loss': '0.5751', 'grad_norm': '0.1974', 'learning_rate': '0.0001904', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.9', 'tokens/total': 6561792, 'tokens/trainable': 6495442, 'epoch': '1.025'}
 14%|███████████████████████████                                                                                                                                                                     | 801/5680 [2:16:41<18:00:45, 13.29s/it] 14%|███████████████████████████                                                                                                                                                                     | 802/5680 [2:16:54<17:51:56, 13.19s/it]                                                                                                                                                                                                                                             {'loss': '0.8698', 'grad_norm': '0.2246', 'learning_rate': '0.0001903', 'ppl': '2.387', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '633.4', 'tokens/total': 6569984, 'tokens/trainable': 6503631, 'epoch': '1.025'}
 14%|███████████████████████████                                                                                                                                                                     | 802/5680 [2:16:54<17:51:56, 13.19s/it] 14%|███████████████████████████▏                                                                                                                                                                    | 803/5680 [2:17:07<17:44:18, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.7745', 'grad_norm': '0.2286', 'learning_rate': '0.0001903', 'ppl': '2.17', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '628.9', 'tokens/total': 6578176, 'tokens/trainable': 6511728, 'epoch': '1.025'}
 14%|███████████████████████████▏                                                                                                                                                                    | 803/5680 [2:17:07<17:44:18, 13.09s/it] 14%|███████████████████████████▏                                                                                                                                                                    | 804/5680 [2:17:20<17:46:06, 13.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8522', 'grad_norm': '0.2338', 'learning_rate': '0.0001903', 'ppl': '2.345', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '615.5', 'tokens/total': 6586368, 'tokens/trainable': 6519833, 'epoch': '1.026'}
 14%|███████████████████████████▏                                                                                                                                                                    | 804/5680 [2:17:20<17:46:06, 13.12s/it] 14%|███████████████████████████▏                                                                                                                                                                    | 805/5680 [2:17:34<18:02:14, 13.32s/it]                                                                                                                                                                                                                                             {'loss': '0.7037', 'grad_norm': '0.2364', 'learning_rate': '0.0001903', 'ppl': '2.021', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.4', 'tokens/total': 6594560, 'tokens/trainable': 6527972, 'epoch': '1.026'}
 14%|███████████████████████████▏                                                                                                                                                                    | 805/5680 [2:17:34<18:02:14, 13.32s/it] 14%|███████████████████████████▏                                                                                                                                                                    | 806/5680 [2:17:47<18:06:52, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.6131', 'grad_norm': '0.1984', 'learning_rate': '0.0001903', 'ppl': '1.846', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.8', 'tokens/total': 6602752, 'tokens/trainable': 6536076, 'epoch': '1.026'}
 14%|███████████████████████████▏                                                                                                                                                                    | 806/5680 [2:17:47<18:06:52, 13.38s/it] 14%|███████████████████████████▎                                                                                                                                                                    | 807/5680 [2:18:01<18:13:53, 13.47s/it]                                                                                                                                                                                                                                             {'loss': '0.6182', 'grad_norm': '0.2153', 'learning_rate': '0.0001902', 'ppl': '1.856', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.2', 'tokens/total': 6610944, 'tokens/trainable': 6544252, 'epoch': '1.026'}
 14%|███████████████████████████▎                                                                                                                                                                    | 807/5680 [2:18:01<18:13:53, 13.47s/it] 14%|███████████████████████████▎                                                                                                                                                                    | 808/5680 [2:18:14<18:14:35, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '1.249', 'grad_norm': '0.3506', 'learning_rate': '0.0001902', 'ppl': '3.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '605.7', 'tokens/total': 6619136, 'tokens/trainable': 6552431, 'epoch': '1.026'}
 14%|███████████████████████████▎                                                                                                                                                                    | 808/5680 [2:18:14<18:14:35, 13.48s/it] 14%|███████████████████████████▎                                                                                                                                                                    | 809/5680 [2:18:27<18:00:25, 13.31s/it]                                                                                                                                                                                                                                             {'loss': '0.5881', 'grad_norm': '0.2007', 'learning_rate': '0.0001902', 'ppl': '1.801', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.7', 'tokens/total': 6627328, 'tokens/trainable': 6560552, 'epoch': '1.026'}
 14%|███████████████████████████▎                                                                                                                                                                    | 809/5680 [2:18:27<18:00:25, 13.31s/it] 14%|███████████████████████████▍                                                                                                                                                                    | 810/5680 [2:18:40<17:51:50, 13.21s/it]                                                                                                                                                                                                                                             {'loss': '0.6366', 'grad_norm': '0.2137', 'learning_rate': '0.0001902', 'ppl': '1.89', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.6', 'tokens/total': 6635520, 'tokens/trainable': 6568612, 'epoch': '1.027'}
 14%|███████████████████████████▍                                                                                                                                                                    | 810/5680 [2:18:40<17:51:50, 13.21s/it] 14%|███████████████████████████▍                                                                                                                                                                    | 811/5680 [2:18:53<17:45:05, 13.12s/it]                                                                                                                                                                                                                                             {'loss': '0.6191', 'grad_norm': '0.2162', 'learning_rate': '0.0001901', 'ppl': '1.857', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '631.2', 'tokens/total': 6643712, 'tokens/trainable': 6576772, 'epoch': '1.027'}
 14%|███████████████████████████▍                                                                                                                                                                    | 811/5680 [2:18:53<17:45:05, 13.12s/it] 14%|███████████████████████████▍                                                                                                                                                                    | 812/5680 [2:19:06<17:41:47, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.8656', 'grad_norm': '0.2206', 'learning_rate': '0.0001901', 'ppl': '2.376', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '628.8', 'tokens/total': 6651904, 'tokens/trainable': 6584940, 'epoch': '1.027'}
 14%|███████████████████████████▍                                                                                                                                                                    | 812/5680 [2:19:06<17:41:47, 13.09s/it] 14%|███████████████████████████▍                                                                                                                                                                    | 813/5680 [2:19:20<17:52:16, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.7666', 'grad_norm': '0.4698', 'learning_rate': '0.0001901', 'ppl': '2.152', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '603.5', 'tokens/total': 6660096, 'tokens/trainable': 6593099, 'epoch': '1.027'}
 14%|███████████████████████████▍                                                                                                                                                                    | 813/5680 [2:19:20<17:52:16, 13.22s/it] 14%|███████████████████████████▌                                                                                                                                                                    | 814/5680 [2:19:33<18:01:54, 13.34s/it]                                                                                                                                                                                                                                             {'loss': '0.8919', 'grad_norm': '0.2792', 'learning_rate': '0.0001901', 'ppl': '2.44', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596.3', 'tokens/total': 6668288, 'tokens/trainable': 6601222, 'epoch': '1.027'}
 14%|███████████████████████████▌                                                                                                                                                                    | 814/5680 [2:19:33<18:01:54, 13.34s/it] 14%|███████████████████████████▌                                                                                                                                                                    | 815/5680 [2:19:47<18:12:49, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.5956', 'grad_norm': '0.2164', 'learning_rate': '0.00019', 'ppl': '1.814', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.1', 'tokens/total': 6676480, 'tokens/trainable': 6609387, 'epoch': '1.027'}
 14%|███████████████████████████▌                                                                                                                                                                    | 815/5680 [2:19:47<18:12:49, 13.48s/it] 14%|███████████████████████████▌                                                                                                                                                                    | 816/5680 [2:20:01<18:16:02, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.8466', 'grad_norm': '0.2803', 'learning_rate': '0.00019', 'ppl': '2.332', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.3', 'tokens/total': 6684672, 'tokens/trainable': 6617559, 'epoch': '1.028'}
 14%|███████████████████████████▌                                                                                                                                                                    | 816/5680 [2:20:01<18:16:02, 13.52s/it] 14%|███████████████████████████▌                                                                                                                                                                    | 817/5680 [2:20:14<18:01:57, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.7517', 'grad_norm': '0.2268', 'learning_rate': '0.00019', 'ppl': '2.121', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '626.7', 'tokens/total': 6692864, 'tokens/trainable': 6625669, 'epoch': '1.028'}
 14%|███████████████████████████▌                                                                                                                                                                    | 817/5680 [2:20:14<18:01:57, 13.35s/it] 14%|███████████████████████████▋                                                                                                                                                                    | 818/5680 [2:20:26<17:51:19, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.7434', 'grad_norm': '0.2468', 'learning_rate': '0.00019', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.1', 'tokens/total': 6701056, 'tokens/trainable': 6633806, 'epoch': '1.028'}
 14%|███████████████████████████▋                                                                                                                                                                    | 818/5680 [2:20:27<17:51:19, 13.22s/it] 14%|███████████████████████████▋                                                                                                                                                                    | 819/5680 [2:20:39<17:44:02, 13.13s/it]                                                                                                                                                                                                                                             {'loss': '0.9167', 'grad_norm': '0.25', 'learning_rate': '0.0001899', 'ppl': '2.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.8', 'tokens/total': 6709248, 'tokens/trainable': 6641983, 'epoch': '1.028'}
 14%|███████████████████████████▋                                                                                                                                                                    | 819/5680 [2:20:39<17:44:02, 13.13s/it] 14%|███████████████████████████▋                                                                                                                                                                    | 820/5680 [2:20:52<17:38:44, 13.07s/it]                                                                                                                                                                                                                                             {'loss': '0.8007', 'grad_norm': '0.2659', 'learning_rate': '0.0001899', 'ppl': '2.227', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.3', 'tokens/total': 6717440, 'tokens/trainable': 6650120, 'epoch': '1.028'}
 14%|███████████████████████████▋                                                                                                                                                                    | 820/5680 [2:20:52<17:38:44, 13.07s/it] 14%|███████████████████████████▊                                                                                                                                                                    | 821/5680 [2:21:06<17:52:20, 13.24s/it]                                                                                                                                                                                                                                             {'loss': '0.5931', 'grad_norm': '0.2071', 'learning_rate': '0.0001899', 'ppl': '1.81', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.1', 'tokens/total': 6725632, 'tokens/trainable': 6658289, 'epoch': '1.029'}
 14%|███████████████████████████▊                                                                                                                                                                    | 821/5680 [2:21:06<17:52:20, 13.24s/it] 14%|███████████████████████████▊                                                                                                                                                                    | 822/5680 [2:21:19<17:55:04, 13.28s/it]                                                                                                                                                                                                                                             {'loss': '0.8181', 'grad_norm': '0.2438', 'learning_rate': '0.0001899', 'ppl': '2.266', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '611.9', 'tokens/total': 6733824, 'tokens/trainable': 6666461, 'epoch': '1.029'}
 14%|███████████████████████████▊                                                                                                                                                                    | 822/5680 [2:21:19<17:55:04, 13.28s/it] 14%|███████████████████████████▊                                                                                                                                                                    | 823/5680 [2:21:33<18:00:15, 13.34s/it]                                                                                                                                                                                                                                             {'loss': '0.7598', 'grad_norm': '0.2592', 'learning_rate': '0.0001898', 'ppl': '2.138', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606', 'tokens/total': 6742016, 'tokens/trainable': 6674639, 'epoch': '1.029'}
 14%|███████████████████████████▊                                                                                                                                                                    | 823/5680 [2:21:33<18:00:15, 13.34s/it] 15%|███████████████████████████▊                                                                                                                                                                    | 824/5680 [2:21:47<18:13:38, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.2206', 'learning_rate': '0.0001898', 'ppl': '2.041', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '585.2', 'tokens/total': 6750208, 'tokens/trainable': 6682775, 'epoch': '1.029'}
 15%|███████████████████████████▊                                                                                                                                                                    | 824/5680 [2:21:47<18:13:38, 13.51s/it] 15%|███████████████████████████▉                                                                                                                                                                    | 825/5680 [2:22:00<18:00:26, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.6175', 'grad_norm': '0.2296', 'learning_rate': '0.0001898', 'ppl': '1.854', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '626', 'tokens/total': 6758400, 'tokens/trainable': 6690894, 'epoch': '1.029'}
 15%|███████████████████████████▉                                                                                                                                                                    | 825/5680 [2:22:00<18:00:26, 13.35s/it] 15%|███████████████████████████▉                                                                                                                                                                    | 826/5680 [2:22:12<17:45:20, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.6984', 'grad_norm': '0.242', 'learning_rate': '0.0001898', 'ppl': '2.011', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '637.1', 'tokens/total': 6766592, 'tokens/trainable': 6699004, 'epoch': '1.029'}
 15%|███████████████████████████▉                                                                                                                                                                    | 826/5680 [2:22:12<17:45:20, 13.17s/it] 15%|███████████████████████████▉                                                                                                                                                                    | 827/5680 [2:22:25<17:38:59, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.7712', 'grad_norm': '0.2204', 'learning_rate': '0.0001897', 'ppl': '2.162', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.6', 'tokens/total': 6774784, 'tokens/trainable': 6707167, 'epoch': '1.03'}
 15%|███████████████████████████▉                                                                                                                                                                    | 827/5680 [2:22:25<17:38:59, 13.09s/it] 15%|███████████████████████████▉                                                                                                                                                                    | 828/5680 [2:22:38<17:34:58, 13.05s/it]                                                                                                                                                                                                                                             {'loss': '0.703', 'grad_norm': '0.2361', 'learning_rate': '0.0001897', 'ppl': '2.02', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '627.4', 'tokens/total': 6782976, 'tokens/trainable': 6715277, 'epoch': '1.03'}
 15%|███████████████████████████▉                                                                                                                                                                    | 828/5680 [2:22:38<17:34:58, 13.05s/it] 15%|████████████████████████████                                                                                                                                                                    | 829/5680 [2:22:52<17:52:37, 13.27s/it]                                                                                                                                                                                                                                             {'loss': '0.5504', 'grad_norm': '0.2055', 'learning_rate': '0.0001897', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.7', 'tokens/total': 6791168, 'tokens/trainable': 6723387, 'epoch': '1.03'}
 15%|████████████████████████████                                                                                                                                                                    | 829/5680 [2:22:52<17:52:37, 13.27s/it] 15%|████████████████████████████                                                                                                                                                                    | 830/5680 [2:23:06<18:04:01, 13.41s/it]                                                                                                                                                                                                                                             {'loss': '0.4773', 'grad_norm': '0.2284', 'learning_rate': '0.0001897', 'ppl': '1.612', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.3', 'tokens/total': 6799360, 'tokens/trainable': 6731523, 'epoch': '1.03'}
 15%|████████████████████████████                                                                                                                                                                    | 830/5680 [2:23:06<18:04:01, 13.41s/it] 15%|████████████████████████████                                                                                                                                                                    | 831/5680 [2:23:20<18:12:57, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.4966', 'grad_norm': '0.2085', 'learning_rate': '0.0001896', 'ppl': '1.643', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589.8', 'tokens/total': 6807552, 'tokens/trainable': 6739649, 'epoch': '1.03'}
 15%|████████████████████████████                                                                                                                                                                    | 831/5680 [2:23:20<18:12:57, 13.52s/it] 15%|████████████████████████████                                                                                                                                                                    | 832/5680 [2:23:33<18:16:35, 13.57s/it]                                                                                                                                                                                                                                             {'loss': '0.6202', 'grad_norm': '0.2165', 'learning_rate': '0.0001896', 'ppl': '1.859', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '597.5', 'tokens/total': 6815744, 'tokens/trainable': 6747822, 'epoch': '1.03'}
 15%|████████████████████████████                                                                                                                                                                    | 832/5680 [2:23:33<18:16:35, 13.57s/it] 15%|████████████████████████████▏                                                                                                                                                                   | 833/5680 [2:23:46<17:59:58, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.7571', 'grad_norm': '0.2686', 'learning_rate': '0.0001896', 'ppl': '2.132', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.7', 'tokens/total': 6823936, 'tokens/trainable': 6755950, 'epoch': '1.031'}
 15%|████████████████████████████▏                                                                                                                                                                   | 833/5680 [2:23:46<17:59:58, 13.37s/it] 15%|████████████████████████████▏                                                                                                                                                                   | 834/5680 [2:23:59<17:51:59, 13.27s/it]                                                                                                                                                                                                                                             {'loss': '0.918', 'grad_norm': '0.2339', 'learning_rate': '0.0001896', 'ppl': '2.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '626.9', 'tokens/total': 6832128, 'tokens/trainable': 6764123, 'epoch': '1.031'}
 15%|████████████████████████████▏                                                                                                                                                                   | 834/5680 [2:23:59<17:51:59, 13.27s/it] 15%|████████████████████████████▏                                                                                                                                                                   | 835/5680 [2:24:13<18:01:25, 13.39s/it]                                                                                                                                                                                                                                             {'loss': '0.8155', 'grad_norm': '0.2401', 'learning_rate': '0.0001895', 'ppl': '2.26', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '599.3', 'tokens/total': 6840320, 'tokens/trainable': 6772313, 'epoch': '1.031'}
 15%|████████████████████████████▏                                                                                                                                                                   | 835/5680 [2:24:13<18:01:25, 13.39s/it] 15%|████████████████████████████▎                                                                                                                                                                   | 836/5680 [2:24:26<17:47:00, 13.22s/it]                                                                                                                                                                                                                                             {'loss': '0.3942', 'grad_norm': '0.1818', 'learning_rate': '0.0001895', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '635.3', 'tokens/total': 6848512, 'tokens/trainable': 6780443, 'epoch': '1.031'}
 15%|████████████████████████████▎                                                                                                                                                                   | 836/5680 [2:24:26<17:47:00, 13.22s/it] 15%|████████████████████████████▎                                                                                                                                                                   | 837/5680 [2:24:39<17:43:16, 13.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7521', 'grad_norm': '0.2437', 'learning_rate': '0.0001895', 'ppl': '2.121', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.5', 'tokens/total': 6856704, 'tokens/trainable': 6788572, 'epoch': '1.031'}
 15%|████████████████████████████▎                                                                                                                                                                   | 837/5680 [2:24:39<17:43:16, 13.17s/it] 15%|████████████████████████████▎                                                                                                                                                                   | 838/5680 [2:24:53<17:59:32, 13.38s/it]                                                                                                                                                                                                                                             {'loss': '0.4752', 'grad_norm': '0.2054', 'learning_rate': '0.0001895', 'ppl': '1.608', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.6', 'tokens/total': 6864896, 'tokens/trainable': 6796763, 'epoch': '1.032'}
 15%|████████████████████████████▎                                                                                                                                                                   | 838/5680 [2:24:53<17:59:32, 13.38s/it] 15%|████████████████████████████▎                                                                                                                                                                   | 839/5680 [2:25:06<18:07:53, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.4194', 'grad_norm': '0.1893', 'learning_rate': '0.0001894', 'ppl': '1.521', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593.9', 'tokens/total': 6873088, 'tokens/trainable': 6804913, 'epoch': '1.032'}
 15%|████████████████████████████▎                                                                                                                                                                   | 839/5680 [2:25:06<18:07:53, 13.48s/it] 15%|████████████████████████████▍                                                                                                                                                                   | 840/5680 [2:25:20<18:15:38, 13.58s/it]                                                                                                                                                                                                                                             {'loss': '0.6344', 'grad_norm': '0.2854', 'learning_rate': '0.0001894', 'ppl': '1.886', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '588.9', 'tokens/total': 6881280, 'tokens/trainable': 6813042, 'epoch': '1.032'}
 15%|████████████████████████████▍                                                                                                                                                                   | 840/5680 [2:25:20<18:15:38, 13.58s/it] 15%|████████████████████████████▍                                                                                                                                                                   | 841/5680 [2:25:34<18:10:56, 13.53s/it]                                                                                                                                                                                                                                             {'loss': '0.6602', 'grad_norm': '0.2171', 'learning_rate': '0.0001894', 'ppl': '1.935', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '608.3', 'tokens/total': 6889472, 'tokens/trainable': 6821189, 'epoch': '1.032'}
 15%|████████████████████████████▍                                                                                                                                                                   | 841/5680 [2:25:34<18:10:56, 13.53s/it] 15%|████████████████████████████▍                                                                                                                                                                   | 842/5680 [2:25:47<17:56:09, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.5955', 'grad_norm': '0.1958', 'learning_rate': '0.0001894', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.1', 'tokens/total': 6897664, 'tokens/trainable': 6829347, 'epoch': '1.032'}
 15%|████████████████████████████▍                                                                                                                                                                   | 842/5680 [2:25:47<17:56:09, 13.35s/it] 15%|████████████████████████████▍                                                                                                                                                                   | 843/5680 [2:25:59<17:42:21, 13.18s/it]                                                                                                                                                                                                                                             {'loss': '1.095', 'grad_norm': '0.273', 'learning_rate': '0.0001894', 'ppl': '2.989', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '640', 'tokens/total': 6905856, 'tokens/trainable': 6837522, 'epoch': '1.032'}
 15%|████████████████████████████▍                                                                                                                                                                   | 843/5680 [2:25:59<17:42:21, 13.18s/it] 15%|████████████████████████████▌                                                                                                                                                                   | 844/5680 [2:26:12<17:35:51, 13.10s/it]                                                                                                                                                                                                                                             {'loss': '0.7157', 'grad_norm': '0.218', 'learning_rate': '0.0001893', 'ppl': '2.046', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.4', 'tokens/total': 6914048, 'tokens/trainable': 6845644, 'epoch': '1.033'}
 15%|████████████████████████████▌                                                                                                                                                                   | 844/5680 [2:26:12<17:35:51, 13.10s/it] 15%|████████████████████████████▌                                                                                                                                                                   | 845/5680 [2:26:25<17:34:48, 13.09s/it]                                                                                                                                                                                                                                             {'loss': '0.9823', 'grad_norm': '0.243', 'learning_rate': '0.0001893', 'ppl': '2.671', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '623.6', 'tokens/total': 6922240, 'tokens/trainable': 6853781, 'epoch': '1.033'}
 15%|████████████████████████████▌                                                                                                                                                                   | 845/5680 [2:26:25<17:34:48, 13.09s/it] 15%|████████████████████████████▌                                                                                                                                                                   | 846/5680 [2:26:39<17:50:57, 13.29s/it]                                                                                                                                                                                                                                             {'loss': '1.014', 'grad_norm': '0.2586', 'learning_rate': '0.0001893', 'ppl': '2.755', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.6', 'tokens/total': 6930432, 'tokens/trainable': 6861921, 'epoch': '1.033'}
 15%|████████████████████████████▌                                                                                                                                                                   | 846/5680 [2:26:39<17:50:57, 13.29s/it] 15%|████████████████████████████▋                                                                                                                                                                   | 847/5680 [2:26:53<18:03:16, 13.45s/it]                                                                                                                                                                                                                                             {'loss': '0.6477', 'grad_norm': '0.2093', 'learning_rate': '0.0001893', 'ppl': '1.911', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.2', 'tokens/total': 6938624, 'tokens/trainable': 6870068, 'epoch': '1.033'}
 15%|████████████████████████████▋                                                                                                                                                                   | 847/5680 [2:26:53<18:03:16, 13.45s/it] 15%|████████████████████████████▋                                                                                                                                                                   | 848/5680 [2:27:07<18:14:23, 13.59s/it]                                                                                                                                                                                                                                             {'loss': '0.5123', 'grad_norm': '0.1996', 'learning_rate': '0.0001892', 'ppl': '1.669', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '580.8', 'tokens/total': 6946816, 'tokens/trainable': 6878148, 'epoch': '1.033'}
 15%|████████████████████████████▋                                                                                                                                                                   | 848/5680 [2:27:07<18:14:23, 13.59s/it] 15%|████████████████████████████▋                                                                                                                                                                   | 849/5680 [2:27:20<18:06:17, 13.49s/it]                                                                                                                                                                                                                                             {'loss': '0.823', 'grad_norm': '0.2525', 'learning_rate': '0.0001892', 'ppl': '2.277', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '616.6', 'tokens/total': 6955008, 'tokens/trainable': 6886318, 'epoch': '1.033'}
 15%|████████████████████████████▋                                                                                                                                                                   | 849/5680 [2:27:20<18:06:17, 13.49s/it] 15%|████████████████████████████▋                                                                                                                                                                   | 850/5680 [2:27:33<17:50:05, 13.29s/it]                                                                                                                                                                                                                                             {'loss': '0.6974', 'grad_norm': '0.2443', 'learning_rate': '0.0001892', 'ppl': '2.009', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.2', 'tokens/total': 6963200, 'tokens/trainable': 6894473, 'epoch': '1.034'}
 15%|████████████████████████████▋                                                                                                                                                                   | 850/5680 [2:27:33<17:50:05, 13.29s/it] 15%|████████████████████████████▊                                                                                                                                                                   | 851/5680 [2:27:46<17:37:42, 13.14s/it]                                                                                                                                                                                                                                             {'loss': '1.148', 'grad_norm': '0.2922', 'learning_rate': '0.0001892', 'ppl': '3.153', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '637.4', 'tokens/total': 6971392, 'tokens/trainable': 6902624, 'epoch': '1.034'}
 15%|████████████████████████████▊                                                                                                                                                                   | 851/5680 [2:27:46<17:37:42, 13.14s/it] 15%|████████████████████████████▊                                                                                                                                                                   | 852/5680 [2:27:59<17:32:18, 13.08s/it]                                                                                                                                                                                                                                             {'loss': '0.7876', 'grad_norm': '0.2635', 'learning_rate': '0.0001891', 'ppl': '2.198', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.7', 'tokens/total': 6979584, 'tokens/trainable': 6910797, 'epoch': '1.034'}
 15%|████████████████████████████▊                                                                                                                                                                   | 852/5680 [2:27:59<17:32:18, 13.08s/it] 15%|████████████████████████████▊                                                                                                                                                                   | 853/5680 [2:28:12<17:31:31, 13.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6613', 'grad_norm': '0.2273', 'learning_rate': '0.0001891', 'ppl': '1.937', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '623.6', 'tokens/total': 6987776, 'tokens/trainable': 6918932, 'epoch': '1.034'}
 15%|████████████████████████████▊                                                                                                                                                                   | 853/5680 [2:28:12<17:31:31, 13.07s/it] 15%|████████████████████████████▊                                                                                                                                                                   | 854/5680 [2:28:25<17:47:39, 13.27s/it]                                                                                                                                                                                                                                             {'loss': '0.7494', 'grad_norm': '0.2343', 'learning_rate': '0.0001891', 'ppl': '2.116', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '593.5', 'tokens/total': 6995968, 'tokens/trainable': 6927085, 'epoch': '1.034'}
 15%|████████████████████████████▊                                                                                                                                                                   | 854/5680 [2:28:25<17:47:39, 13.27s/it] 15%|████████████████████████████▉                                                                                                                                                                   | 855/5680 [2:28:39<17:53:45, 13.35s/it]                                                                                                                                                                                                                                             {'loss': '0.4722', 'grad_norm': '0.2011', 'learning_rate': '0.0001891', 'ppl': '1.603', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '600.4', 'tokens/total': 7004160, 'tokens/trainable': 6935209, 'epoch': '1.035'}
 15%|████████████████████████████▉                                                                                                                                                                   | 855/5680 [2:28:39<17:53:45, 13.35s/it] 15%|████████████████████████████▉                                                                                                                                                                   | 856/5680 [2:28:53<18:00:51, 13.44s/it]                                                                                                                                                                                                                                             {'loss': '0.5061', 'grad_norm': '0.2079', 'learning_rate': '0.000189', 'ppl': '1.659', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '598.2', 'tokens/total': 7012352, 'tokens/trainable': 6943368, 'epoch': '1.035'}
 15%|████████████████████████████▉                                                                                                                                                                   | 856/5680 [2:28:53<18:00:51, 13.44s/it] 15%|████████████████████████████▉                                                                                                                                                                   | 857/5680 [2:29:06<18:02:17, 13.46s/it]                                                                                                                                                                                                                                             {'loss': '0.8274', 'grad_norm': '0.2284', 'learning_rate': '0.000189', 'ppl': '2.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606.2', 'tokens/total': 7020544, 'tokens/trainable': 6951553, 'epoch': '1.035'}
 15%|████████████████████████████▉                                                                                                                                                                   | 857/5680 [2:29:06<18:02:17, 13.46s/it] 15%|█████████████████████████████                                                                                                                                                                   | 858/5680 [2:29:19<17:49:21, 13.31s/it]                                                                                                                                                                                                                                             {'loss': '0.652', 'grad_norm': '0.2315', 'learning_rate': '0.000189', 'ppl': '1.919', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '632.9', 'tokens/total': 7028736, 'tokens/trainable': 6959736, 'epoch': '1.035'}
 15%|█████████████████████████████                                                                                                                                                                   | 858/5680 [2:29:19<17:49:21, 13.31s/it] 15%|█████████████████████████████                                                                                                                                                                   | 859/5680 [2:29:32<17:36:51, 13.15s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.2135', 'learning_rate': '0.0001889', 'ppl': '1.71', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '638.2', 'tokens/total': 7036928, 'tokens/trainable': 6967896, 'epoch': '1.035'}
 15%|█████████████████████████████                                                                                                                                                                   | 859/5680 [2:29:32<17:36:51, 13.15s/it] 15%|█████████████████████████████                                                                                                                                                                   | 860/5680 [2:29:45<17:29:32, 13.06s/it]                                                                                                                                                                                                                                             {'loss': '0.8016', 'grad_norm': '0.2472', 'learning_rate': '0.0001889', 'ppl': '2.229', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.6', 'tokens/total': 7045120, 'tokens/trainable': 6976047, 'epoch': '1.035'}
 15%|█████████████████████████████                                                                                                                                                                   | 860/5680 [2:29:45<17:29:32, 13.06s/it] 15%|█████████████████████████████                                                                                                                                                                   | 861/5680 [2:29:58<17:32:10, 13.10s/it]                                                                                                                                                                                                                                             {'loss': '0.6269', 'grad_norm': '0.2179', 'learning_rate': '0.0001889', 'ppl': '1.872', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '617.9', 'tokens/total': 7053312, 'tokens/trainable': 6984187, 'epoch': '1.036'}
 15%|█████████████████████████████                                                                                                                                                                   | 861/5680 [2:29:58<17:32:10, 13.10s/it] 15%|█████████████████████████████▏                                                                                                                                                                  | 862/5680 [2:30:12<17:53:27, 13.37s/it]                                                                                                                                                                                                                                             {'loss': '0.7071', 'grad_norm': '0.2273', 'learning_rate': '0.0001889', 'ppl': '2.028', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '582.2', 'tokens/total': 7061504, 'tokens/trainable': 6992324, 'epoch': '1.036'}
 15%|█████████████████████████████▏                                                                                                                                                                  | 862/5680 [2:30:12<17:53:27, 13.37s/it] 15%|█████████████████████████████▏                                                                                                                                                                  | 863/5680 [2:30:26<18:02:16, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.6552', 'grad_norm': '0.2448', 'learning_rate': '0.0001888', 'ppl': '1.926', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '590.2', 'tokens/total': 7069696, 'tokens/trainable': 7000434, 'epoch': '1.036'}
 15%|█████████████████████████████▏                                                                                                                                                                  | 863/5680 [2:30:26<18:02:16, 13.48s/it] 15%|█████████████████████████████▏                                                                                                                                                                  | 864/5680 [2:30:39<18:07:26, 13.55s/it]                                                                                                                                                                                                                                             {'loss': '1.189', 'grad_norm': '0.3252', 'learning_rate': '0.0001888', 'ppl': '3.285', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.8', 'tokens/total': 7077888, 'tokens/trainable': 7008582, 'epoch': '1.036'}
 15%|█████████████████████████████▏                                                                                                                                                                  | 864/5680 [2:30:39<18:07:26, 13.55s/it] 15%|█████████████████████████████▏                                                                                                                                                                  | 865/5680 [2:30:53<18:04:11, 13.51s/it]                                                                                                                                                                                                                                             {'loss': '0.7562', 'grad_norm': '0.2475', 'learning_rate': '0.0001888', 'ppl': '2.13', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '607.6', 'tokens/total': 7086080, 'tokens/trainable': 7016734, 'epoch': '1.036'}
 15%|█████████████████████████████▏                                                                                                                                                                  | 865/5680 [2:30:53<18:04:11, 13.51s/it] 15%|█████████████████████████████▎                                                                                                                                                                  | 866/5680 [2:31:06<17:59:29, 13.45s/it]                                                                                                                                                                                                                                             {'loss': '0.569', 'grad_norm': '0.2106', 'learning_rate': '0.0001888', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '611.4', 'tokens/total': 7094272, 'tokens/trainable': 7024878, 'epoch': '1.036'}
 15%|█████████████████████████████▎                                                                                                                                                                  | 866/5680 [2:31:06<17:59:29, 13.45s/it] 15%|█████████████████████████████▎                                                                                                                                                                  | 867/5680 [2:31:19<17:45:01, 13.28s/it]                                                                                                                                                                                                                                             {'loss': '0.7002', 'grad_norm': '0.2303', 'learning_rate': '0.0001887', 'ppl': '2.014', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '633.7', 'tokens/total': 7102464, 'tokens/trainable': 7033021, 'epoch': '1.037'}
 15%|█████████████████████████████▎                                                                                                                                                                  | 867/5680 [2:31:19<17:45:01, 13.28s/it] 15%|█████████████████████████████▎                                                                                                                                                                  | 868/5680 [2:31:32<17:39:29, 13.21s/it]                                                                                                                                                                                                                                             {'loss': '0.5469', 'grad_norm': '0.231', 'learning_rate': '0.0001887', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '624', 'tokens/total': 7110656, 'tokens/trainable': 7041162, 'epoch': '1.037'}
 15%|█████████████████████████████▎                                                                                                                                                                  | 868/5680 [2:31:32<17:39:29, 13.21s/it] 15%|█████████████████████████████▎                                                                                                                                                                  | 869/5680 [2:31:45<17:32:33, 13.13s/it]                                                                                                                                                                                                                                             {'loss': '0.5429', 'grad_norm': '0.2143', 'learning_rate': '0.0001887', 'ppl': '1.721', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630', 'tokens/total': 7118848, 'tokens/trainable': 7049301, 'epoch': '1.037'}
 15%|█████████████████████████████▎                                                                                                                                                                  | 869/5680 [2:31:45<17:32:33, 13.13s/it] 15%|█████████████████████████████▍                                                                                                                                                                  | 870/5680 [2:31:59<17:47:10, 13.31s/it]                                                                                                                                                                                                                                             {'loss': '0.5341', 'grad_norm': '0.2103', 'learning_rate': '0.0001887', 'ppl': '1.706', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.3', 'tokens/total': 7127040, 'tokens/trainable': 7057435, 'epoch': '1.037'}
 15%|█████████████████████████████▍                                                                                                                                                                  | 870/5680 [2:31:59<17:47:10, 13.31s/it] 15%|█████████████████████████████▍                                                                                                                                                                  | 871/5680 [2:32:12<17:58:03, 13.45s/it]                                                                                                                                                                                                                                             {'loss': '0.7642', 'grad_norm': '0.2247', 'learning_rate': '0.0001886', 'ppl': '2.147', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '594.6', 'tokens/total': 7135232, 'tokens/trainable': 7065623, 'epoch': '1.037'}
 15%|█████████████████████████████▍                                                                                                                                                                  | 871/5680 [2:32:12<17:58:03, 13.45s/it] 15%|█████████████████████████████▍                                                                                                                                                                  | 872/5680 [2:32:26<18:03:23, 13.52s/it]                                                                                                                                                                                                                                             {'loss': '0.7365', 'grad_norm': '0.2379', 'learning_rate': '0.0001886', 'ppl': '2.089', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '596', 'tokens/total': 7143424, 'tokens/trainable': 7073774, 'epoch': '1.038'}
 15%|█████████████████████████████▍                                                                                                                                                                  | 872/5680 [2:32:26<18:03:23, 13.52s/it] 15%|█████████████████████████████▌                                                                                                                                                                  | 873/5680 [2:32:40<18:10:27, 13.61s/it]                                                                                                                                                                                                                                             {'loss': '0.6075', 'grad_norm': '0.2257', 'learning_rate': '0.0001886', 'ppl': '1.836', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '592.4', 'tokens/total': 7151616, 'tokens/trainable': 7081956, 'epoch': '1.038'}
 15%|█████████████████████████████▌                                                                                                                                                                  | 873/5680 [2:32:40<18:10:27, 13.61s/it] 15%|█████████████████████████████▌                                                                                                                                                                  | 874/5680 [2:32:53<17:59:54, 13.48s/it]                                                                                                                                                                                                                                             {'loss': '0.5959', 'grad_norm': '0.208', 'learning_rate': '0.0001886', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '612.4', 'tokens/total': 7159808, 'tokens/trainable': 7090022, 'epoch': '1.038'}
 15%|█████████████████████████████▌                                                                                                                                                                  | 874/5680 [2:32:53<17:59:54, 13.48s/it] 15%|█████████████████████████████▌                                                                                                                                                                  | 875/5680 [2:33:06<17:44:48, 13.30s/it]                                                                                                                                                                                                                                             {'loss': '0.5502', 'grad_norm': '0.2153', 'learning_rate': '0.0001885', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.8', 'tokens/total': 7168000, 'tokens/trainable': 7098207, 'epoch': '1.038'}
 15%|█████████████████████████████▌                                                                                                                                                                  | 875/5680 [2:33:06<17:44:48, 13.30s/it] 15%|█████████████████████████████▌                                                                                                                                                                  | 876/5680 [2:33:16<16:28:25, 12.34s/it]                                                                                                                                                                                                                                             {'loss': '0.628', 'grad_norm': '0.1935', 'learning_rate': '0.0001885', 'ppl': '1.874', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '805.3', 'tokens/total': 7176192, 'tokens/trainable': 7106351, 'epoch': '1.038'}
 15%|█████████████████████████████▌                                                                                                                                                                  | 876/5680 [2:33:16<16:28:25, 12.34s/it] 15%|█████████████████████████████▋                                                                                                                                                                  | 877/5680 [2:33:24<14:41:36, 11.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5818', 'grad_norm': '0.2407', 'learning_rate': '0.0001885', 'ppl': '1.789', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7184384, 'tokens/trainable': 7114513, 'epoch': '1.038'}
 15%|█████████████████████████████▋                                                                                                                                                                  | 877/5680 [2:33:24<14:41:36, 11.01s/it] 15%|█████████████████████████████▋                                                                                                                                                                  | 878/5680 [2:33:32<13:25:17, 10.06s/it]                                                                                                                                                                                                                                             {'loss': '0.8925', 'grad_norm': '0.2592', 'learning_rate': '0.0001885', 'ppl': '2.441', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7192576, 'tokens/trainable': 7122608, 'epoch': '1.039'}
 15%|█████████████████████████████▋                                                                                                                                                                  | 878/5680 [2:33:32<13:25:17, 10.06s/it] 15%|█████████████████████████████▋                                                                                                                                                                  | 879/5680 [2:33:40<12:31:55,  9.40s/it]                                                                                                                                                                                                                                             {'loss': '0.8472', 'grad_norm': '0.2736', 'learning_rate': '0.0001884', 'ppl': '2.333', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 7200768, 'tokens/trainable': 7130797, 'epoch': '1.039'}
 15%|█████████████████████████████▋                                                                                                                                                                  | 879/5680 [2:33:40<12:31:55,  9.40s/it] 15%|█████████████████████████████▋                                                                                                                                                                  | 880/5680 [2:33:48<11:55:48,  8.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7625', 'grad_norm': '0.2343', 'learning_rate': '0.0001884', 'ppl': '2.144', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 7208960, 'tokens/trainable': 7138972, 'epoch': '1.039'}
 15%|█████████████████████████████▋                                                                                                                                                                  | 880/5680 [2:33:48<11:55:48,  8.95s/it] 16%|█████████████████████████████▊                                                                                                                                                                  | 881/5680 [2:33:55<11:28:53,  8.61s/it]                                                                                                                                                                                                                                             {'loss': '0.7977', 'grad_norm': '0.2758', 'learning_rate': '0.0001884', 'ppl': '2.22', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 7217152, 'tokens/trainable': 7147112, 'epoch': '1.039'}
 16%|█████████████████████████████▊                                                                                                                                                                  | 881/5680 [2:33:55<11:28:53,  8.61s/it] 16%|█████████████████████████████▊                                                                                                                                                                  | 882/5680 [2:34:03<11:12:21,  8.41s/it]                                                                                                                                                                                                                                             {'loss': '0.8671', 'grad_norm': '0.2401', 'learning_rate': '0.0001884', 'ppl': '2.38', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 7225344, 'tokens/trainable': 7155232, 'epoch': '1.039'}
 16%|█████████████████████████████▊                                                                                                                                                                  | 882/5680 [2:34:03<11:12:21,  8.41s/it] 16%|█████████████████████████████▊                                                                                                                                                                  | 883/5680 [2:34:11<10:58:53,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.4676', 'grad_norm': '0.1843', 'learning_rate': '0.0001883', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 7233536, 'tokens/trainable': 7163404, 'epoch': '1.039'}
 16%|█████████████████████████████▊                                                                                                                                                                  | 883/5680 [2:34:11<10:58:53,  8.24s/it] 16%|█████████████████████████████▉                                                                                                                                                                  | 884/5680 [2:34:19<10:49:35,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.9232', 'grad_norm': '0.2922', 'learning_rate': '0.0001883', 'ppl': '2.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7241728, 'tokens/trainable': 7171519, 'epoch': '1.04'}
 16%|█████████████████████████████▉                                                                                                                                                                  | 884/5680 [2:34:19<10:49:35,  8.13s/it] 16%|█████████████████████████████▉                                                                                                                                                                  | 885/5680 [2:34:27<10:43:11,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.7156', 'grad_norm': '0.2168', 'learning_rate': '0.0001883', 'ppl': '2.045', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 7249920, 'tokens/trainable': 7179691, 'epoch': '1.04'}
 16%|█████████████████████████████▉                                                                                                                                                                  | 885/5680 [2:34:27<10:43:11,  8.05s/it] 16%|█████████████████████████████▉                                                                                                                                                                  | 886/5680 [2:34:35<10:38:27,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '1.109', 'grad_norm': '0.2425', 'learning_rate': '0.0001883', 'ppl': '3.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7258112, 'tokens/trainable': 7187815, 'epoch': '1.04'}
 16%|█████████████████████████████▉                                                                                                                                                                  | 886/5680 [2:34:35<10:38:27,  7.99s/it] 16%|█████████████████████████████▉                                                                                                                                                                  | 887/5680 [2:34:43<10:35:38,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7328', 'grad_norm': '0.2737', 'learning_rate': '0.0001882', 'ppl': '2.081', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 7266304, 'tokens/trainable': 7196003, 'epoch': '1.04'}
 16%|█████████████████████████████▉                                                                                                                                                                  | 887/5680 [2:34:43<10:35:38,  7.96s/it] 16%|██████████████████████████████                                                                                                                                                                  | 888/5680 [2:34:51<10:33:40,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6685', 'grad_norm': '0.2117', 'learning_rate': '0.0001882', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7274496, 'tokens/trainable': 7204173, 'epoch': '1.04'}
 16%|██████████████████████████████                                                                                                                                                                  | 888/5680 [2:34:51<10:33:40,  7.93s/it] 16%|██████████████████████████████                                                                                                                                                                  | 889/5680 [2:34:58<10:32:33,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6697', 'grad_norm': '0.2101', 'learning_rate': '0.0001882', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7282688, 'tokens/trainable': 7212329, 'epoch': '1.04'}
 16%|██████████████████████████████                                                                                                                                                                  | 889/5680 [2:34:58<10:32:33,  7.92s/it] 16%|██████████████████████████████                                                                                                                                                                  | 890/5680 [2:35:06<10:33:23,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6657', 'grad_norm': '0.2375', 'learning_rate': '0.0001882', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 7290880, 'tokens/trainable': 7220516, 'epoch': '1.041'}
 16%|██████████████████████████████                                                                                                                                                                  | 890/5680 [2:35:06<10:33:23,  7.93s/it] 16%|██████████████████████████████                                                                                                                                                                  | 891/5680 [2:35:14<10:33:17,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6005', 'grad_norm': '0.2206', 'learning_rate': '0.0001881', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7299072, 'tokens/trainable': 7228674, 'epoch': '1.041'}
 16%|██████████████████████████████                                                                                                                                                                  | 891/5680 [2:35:14<10:33:17,  7.93s/it] 16%|██████████████████████████████▏                                                                                                                                                                 | 892/5680 [2:35:22<10:32:09,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7074', 'grad_norm': '0.2189', 'learning_rate': '0.0001881', 'ppl': '2.029', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 7307264, 'tokens/trainable': 7236744, 'epoch': '1.041'}
 16%|██████████████████████████████▏                                                                                                                                                                 | 892/5680 [2:35:22<10:32:09,  7.92s/it] 16%|██████████████████████████████▏                                                                                                                                                                 | 893/5680 [2:35:30<10:32:43,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5978', 'grad_norm': '0.2165', 'learning_rate': '0.0001881', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 7315456, 'tokens/trainable': 7244896, 'epoch': '1.041'}
 16%|██████████████████████████████▏                                                                                                                                                                 | 893/5680 [2:35:30<10:32:43,  7.93s/it] 16%|██████████████████████████████▏                                                                                                                                                                 | 894/5680 [2:35:38<10:32:49,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.8954', 'grad_norm': '0.2968', 'learning_rate': '0.000188', 'ppl': '2.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 7323648, 'tokens/trainable': 7253001, 'epoch': '1.041'}
 16%|██████████████████████████████▏                                                                                                                                                                 | 894/5680 [2:35:38<10:32:49,  7.93s/it] 16%|██████████████████████████████▎                                                                                                                                                                 | 895/5680 [2:35:46<10:31:48,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9302', 'grad_norm': '0.2602', 'learning_rate': '0.000188', 'ppl': '2.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7331840, 'tokens/trainable': 7261188, 'epoch': '1.042'}
 16%|██████████████████████████████▎                                                                                                                                                                 | 895/5680 [2:35:46<10:31:48,  7.92s/it] 16%|██████████████████████████████▎                                                                                                                                                                 | 896/5680 [2:35:54<10:31:13,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.844', 'grad_norm': '0.2766', 'learning_rate': '0.000188', 'ppl': '2.326', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7340032, 'tokens/trainable': 7269340, 'epoch': '1.042'}
 16%|██████████████████████████████▎                                                                                                                                                                 | 896/5680 [2:35:54<10:31:13,  7.92s/it] 16%|██████████████████████████████▎                                                                                                                                                                 | 897/5680 [2:36:02<10:30:24,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5592', 'grad_norm': '0.2131', 'learning_rate': '0.000188', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 7348224, 'tokens/trainable': 7277390, 'epoch': '1.042'}
 16%|██████████████████████████████▎                                                                                                                                                                 | 897/5680 [2:36:02<10:30:24,  7.91s/it] 16%|██████████████████████████████▎                                                                                                                                                                 | 898/5680 [2:36:10<10:30:06,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6697', 'grad_norm': '0.2231', 'learning_rate': '0.0001879', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7356416, 'tokens/trainable': 7285556, 'epoch': '1.042'}
 16%|██████████████████████████████▎                                                                                                                                                                 | 898/5680 [2:36:10<10:30:06,  7.91s/it] 16%|██████████████████████████████▍                                                                                                                                                                 | 899/5680 [2:36:18<10:29:56,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8207', 'grad_norm': '0.2888', 'learning_rate': '0.0001879', 'ppl': '2.272', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 7364608, 'tokens/trainable': 7293741, 'epoch': '1.042'}
 16%|██████████████████████████████▍                                                                                                                                                                 | 899/5680 [2:36:18<10:29:56,  7.91s/it] 16%|██████████████████████████████▍                                                                                                                                                                 | 900/5680 [2:36:25<10:27:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9069', 'grad_norm': '0.2758', 'learning_rate': '0.0001879', 'ppl': '2.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 7372800, 'tokens/trainable': 7301909, 'epoch': '1.042'}
 16%|██████████████████████████████▍                                                                                                                                                                 | 900/5680 [2:36:25<10:27:59,  7.88s/it] 16%|██████████████████████████████▍                                                                                                                                                                 | 901/5680 [2:36:33<10:27:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5665', 'grad_norm': '0.2068', 'learning_rate': '0.0001879', 'ppl': '1.762', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 7380992, 'tokens/trainable': 7310055, 'epoch': '1.043'}
 16%|██████████████████████████████▍                                                                                                                                                                 | 901/5680 [2:36:33<10:27:02,  7.87s/it] 16%|██████████████████████████████▍                                                                                                                                                                 | 902/5680 [2:36:41<10:26:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6699', 'grad_norm': '0.2156', 'learning_rate': '0.0001878', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 7389184, 'tokens/trainable': 7318236, 'epoch': '1.043'}
 16%|██████████████████████████████▍                                                                                                                                                                 | 902/5680 [2:36:41<10:26:21,  7.87s/it] 16%|██████████████████████████████▌                                                                                                                                                                 | 903/5680 [2:36:49<10:26:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.815', 'grad_norm': '0.2549', 'learning_rate': '0.0001878', 'ppl': '2.259', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7397376, 'tokens/trainable': 7326416, 'epoch': '1.043'}
 16%|██████████████████████████████▌                                                                                                                                                                 | 903/5680 [2:36:49<10:26:55,  7.87s/it] 16%|██████████████████████████████▌                                                                                                                                                                 | 904/5680 [2:36:57<10:26:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6784', 'grad_norm': '0.2706', 'learning_rate': '0.0001878', 'ppl': '1.971', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7405568, 'tokens/trainable': 7334567, 'epoch': '1.043'}
 16%|██████████████████████████████▌                                                                                                                                                                 | 904/5680 [2:36:57<10:26:58,  7.88s/it] 16%|██████████████████████████████▌                                                                                                                                                                 | 905/5680 [2:37:05<10:26:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7432', 'grad_norm': '0.242', 'learning_rate': '0.0001878', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 7413760, 'tokens/trainable': 7342746, 'epoch': '1.043'}
 16%|██████████████████████████████▌                                                                                                                                                                 | 905/5680 [2:37:05<10:26:50,  7.88s/it] 16%|██████████████████████████████▋                                                                                                                                                                 | 906/5680 [2:37:13<10:28:02,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5564', 'grad_norm': '0.2161', 'learning_rate': '0.0001877', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7421952, 'tokens/trainable': 7350920, 'epoch': '1.043'}
 16%|██████████████████████████████▋                                                                                                                                                                 | 906/5680 [2:37:13<10:28:02,  7.89s/it] 16%|██████████████████████████████▋                                                                                                                                                                 | 907/5680 [2:37:21<10:26:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8724', 'grad_norm': '0.2459', 'learning_rate': '0.0001877', 'ppl': '2.393', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7430144, 'tokens/trainable': 7359060, 'epoch': '1.044'}
 16%|██████████████████████████████▋                                                                                                                                                                 | 907/5680 [2:37:21<10:26:53,  7.88s/it] 16%|██████████████████████████████▋                                                                                                                                                                 | 908/5680 [2:37:28<10:26:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9463', 'grad_norm': '0.252', 'learning_rate': '0.0001877', 'ppl': '2.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7438336, 'tokens/trainable': 7367206, 'epoch': '1.044'}
 16%|██████████████████████████████▋                                                                                                                                                                 | 908/5680 [2:37:28<10:26:17,  7.87s/it] 16%|██████████████████████████████▋                                                                                                                                                                 | 909/5680 [2:37:36<10:25:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7056', 'grad_norm': '0.2723', 'learning_rate': '0.0001877', 'ppl': '2.025', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 7446528, 'tokens/trainable': 7375358, 'epoch': '1.044'}
 16%|██████████████████████████████▋                                                                                                                                                                 | 909/5680 [2:37:36<10:25:23,  7.86s/it] 16%|██████████████████████████████▊                                                                                                                                                                 | 910/5680 [2:37:44<10:26:42,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9019', 'grad_norm': '0.2712', 'learning_rate': '0.0001876', 'ppl': '2.464', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7454720, 'tokens/trainable': 7383530, 'epoch': '1.044'}
 16%|██████████████████████████████▊                                                                                                                                                                 | 910/5680 [2:37:44<10:26:42,  7.88s/it] 16%|██████████████████████████████▊                                                                                                                                                                 | 911/5680 [2:37:52<10:26:47,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.761', 'grad_norm': '0.2401', 'learning_rate': '0.0001876', 'ppl': '2.14', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7462912, 'tokens/trainable': 7391689, 'epoch': '1.044'}
 16%|██████████████████████████████▊                                                                                                                                                                 | 911/5680 [2:37:52<10:26:47,  7.89s/it] 16%|██████████████████████████████▊                                                                                                                                                                 | 912/5680 [2:38:00<10:27:48,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5847', 'grad_norm': '0.2434', 'learning_rate': '0.0001876', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 7471104, 'tokens/trainable': 7399823, 'epoch': '1.045'}
 16%|██████████████████████████████▊                                                                                                                                                                 | 912/5680 [2:38:00<10:27:48,  7.90s/it] 16%|██████████████████████████████▊                                                                                                                                                                 | 913/5680 [2:38:08<10:28:16,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9648', 'grad_norm': '0.2447', 'learning_rate': '0.0001875', 'ppl': '2.624', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 7479296, 'tokens/trainable': 7407935, 'epoch': '1.045'}
 16%|██████████████████████████████▊                                                                                                                                                                 | 913/5680 [2:38:08<10:28:16,  7.91s/it] 16%|██████████████████████████████▉                                                                                                                                                                 | 914/5680 [2:38:16<10:29:01,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4903', 'grad_norm': '0.1996', 'learning_rate': '0.0001875', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 7487488, 'tokens/trainable': 7416055, 'epoch': '1.045'}
 16%|██████████████████████████████▉                                                                                                                                                                 | 914/5680 [2:38:16<10:29:01,  7.92s/it] 16%|██████████████████████████████▉                                                                                                                                                                 | 915/5680 [2:38:24<10:28:04,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.718', 'grad_norm': '0.2175', 'learning_rate': '0.0001875', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7495680, 'tokens/trainable': 7424185, 'epoch': '1.045'}
 16%|██████████████████████████████▉                                                                                                                                                                 | 915/5680 [2:38:24<10:28:04,  7.91s/it] 16%|██████████████████████████████▉                                                                                                                                                                 | 916/5680 [2:38:32<10:26:28,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4807', 'grad_norm': '0.1991', 'learning_rate': '0.0001875', 'ppl': '1.617', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7503872, 'tokens/trainable': 7432287, 'epoch': '1.045'}
 16%|██████████████████████████████▉                                                                                                                                                                 | 916/5680 [2:38:32<10:26:28,  7.89s/it] 16%|██████████████████████████████▉                                                                                                                                                                 | 917/5680 [2:38:40<10:25:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7744', 'grad_norm': '0.2828', 'learning_rate': '0.0001874', 'ppl': '2.169', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 7512064, 'tokens/trainable': 7440357, 'epoch': '1.045'}
 16%|██████████████████████████████▉                                                                                                                                                                 | 917/5680 [2:38:40<10:25:53,  7.88s/it] 16%|███████████████████████████████                                                                                                                                                                 | 918/5680 [2:38:47<10:24:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7908', 'grad_norm': '0.2223', 'learning_rate': '0.0001874', 'ppl': '2.205', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 7520256, 'tokens/trainable': 7448517, 'epoch': '1.046'}
 16%|███████████████████████████████                                                                                                                                                                 | 918/5680 [2:38:47<10:24:58,  7.87s/it] 16%|███████████████████████████████                                                                                                                                                                 | 919/5680 [2:38:55<10:24:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6966', 'grad_norm': '0.2292', 'learning_rate': '0.0001874', 'ppl': '2.007', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 7528448, 'tokens/trainable': 7456648, 'epoch': '1.046'}
 16%|███████████████████████████████                                                                                                                                                                 | 919/5680 [2:38:55<10:24:25,  7.87s/it] 16%|███████████████████████████████                                                                                                                                                                 | 920/5680 [2:39:03<10:24:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6161', 'grad_norm': '0.2116', 'learning_rate': '0.0001874', 'ppl': '1.852', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 7536640, 'tokens/trainable': 7464803, 'epoch': '1.046'}
 16%|███████████████████████████████                                                                                                                                                                 | 920/5680 [2:39:03<10:24:37,  7.87s/it] 16%|███████████████████████████████▏                                                                                                                                                                | 921/5680 [2:39:11<10:24:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8512', 'grad_norm': '0.2368', 'learning_rate': '0.0001873', 'ppl': '2.342', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 7544832, 'tokens/trainable': 7472969, 'epoch': '1.046'}
 16%|███████████████████████████████▏                                                                                                                                                                | 921/5680 [2:39:11<10:24:18,  7.87s/it] 16%|███████████████████████████████▏                                                                                                                                                                | 922/5680 [2:39:19<10:27:08,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6092', 'grad_norm': '0.2162', 'learning_rate': '0.0001873', 'ppl': '1.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 7553024, 'tokens/trainable': 7481142, 'epoch': '1.046'}
 16%|███████████████████████████████▏                                                                                                                                                                | 922/5680 [2:39:19<10:27:08,  7.91s/it] 16%|███████████████████████████████▏                                                                                                                                                                | 923/5680 [2:39:27<10:27:16,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7293', 'grad_norm': '0.2439', 'learning_rate': '0.0001873', 'ppl': '2.074', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7561216, 'tokens/trainable': 7489322, 'epoch': '1.046'}
 16%|███████████████████████████████▏                                                                                                                                                                | 923/5680 [2:39:27<10:27:16,  7.91s/it] 16%|███████████████████████████████▏                                                                                                                                                                | 924/5680 [2:39:35<10:27:45,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6544', 'grad_norm': '0.2534', 'learning_rate': '0.0001872', 'ppl': '1.924', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7569408, 'tokens/trainable': 7497476, 'epoch': '1.047'}
 16%|███████████████████████████████▏                                                                                                                                                                | 924/5680 [2:39:35<10:27:45,  7.92s/it] 16%|███████████████████████████████▎                                                                                                                                                                | 925/5680 [2:39:43<10:27:53,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6626', 'grad_norm': '0.2382', 'learning_rate': '0.0001872', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 7577600, 'tokens/trainable': 7505588, 'epoch': '1.047'}
 16%|███████████████████████████████▎                                                                                                                                                                | 925/5680 [2:39:43<10:27:53,  7.92s/it] 16%|███████████████████████████████▎                                                                                                                                                                | 926/5680 [2:39:51<10:26:25,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6863', 'grad_norm': '0.2211', 'learning_rate': '0.0001872', 'ppl': '1.986', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 7585792, 'tokens/trainable': 7513753, 'epoch': '1.047'}
 16%|███████████████████████████████▎                                                                                                                                                                | 926/5680 [2:39:51<10:26:25,  7.91s/it] 16%|███████████████████████████████▎                                                                                                                                                                | 927/5680 [2:39:58<10:24:58,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8176', 'grad_norm': '0.2335', 'learning_rate': '0.0001872', 'ppl': '2.265', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 7593984, 'tokens/trainable': 7521887, 'epoch': '1.047'}
 16%|███████████████████████████████▎                                                                                                                                                                | 927/5680 [2:39:58<10:24:58,  7.89s/it] 16%|███████████████████████████████▎                                                                                                                                                                | 928/5680 [2:40:06<10:22:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6096', 'grad_norm': '0.2259', 'learning_rate': '0.0001871', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7602176, 'tokens/trainable': 7529964, 'epoch': '1.047'}
 16%|███████████████████████████████▎                                                                                                                                                                | 928/5680 [2:40:06<10:22:58,  7.87s/it] 16%|███████████████████████████████▍                                                                                                                                                                | 929/5680 [2:40:14<10:23:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5585', 'grad_norm': '0.2572', 'learning_rate': '0.0001871', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 7610368, 'tokens/trainable': 7538074, 'epoch': '1.048'}
 16%|███████████████████████████████▍                                                                                                                                                                | 929/5680 [2:40:14<10:23:09,  7.87s/it] 16%|███████████████████████████████▍                                                                                                                                                                | 930/5680 [2:40:22<10:23:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7219', 'grad_norm': '0.2423', 'learning_rate': '0.0001871', 'ppl': '2.058', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 7618560, 'tokens/trainable': 7546170, 'epoch': '1.048'}
 16%|███████████████████████████████▍                                                                                                                                                                | 930/5680 [2:40:22<10:23:27,  7.88s/it] 16%|███████████████████████████████▍                                                                                                                                                                | 931/5680 [2:40:30<10:22:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8744', 'grad_norm': '0.2913', 'learning_rate': '0.0001871', 'ppl': '2.398', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7626752, 'tokens/trainable': 7554266, 'epoch': '1.048'}
 16%|███████████████████████████████▍                                                                                                                                                                | 931/5680 [2:40:30<10:22:51,  7.87s/it] 16%|███████████████████████████████▌                                                                                                                                                                | 932/5680 [2:40:38<10:22:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8816', 'grad_norm': '0.2727', 'learning_rate': '0.000187', 'ppl': '2.415', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7634944, 'tokens/trainable': 7562408, 'epoch': '1.048'}
 16%|███████████████████████████████▌                                                                                                                                                                | 932/5680 [2:40:38<10:22:53,  7.87s/it] 16%|███████████████████████████████▌                                                                                                                                                                | 933/5680 [2:40:46<10:23:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5822', 'grad_norm': '0.2421', 'learning_rate': '0.000187', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7643136, 'tokens/trainable': 7570538, 'epoch': '1.048'}
 16%|███████████████████████████████▌                                                                                                                                                                | 933/5680 [2:40:46<10:23:37,  7.88s/it] 16%|███████████████████████████████▌                                                                                                                                                                | 934/5680 [2:40:54<10:23:07,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7151', 'grad_norm': '0.235', 'learning_rate': '0.000187', 'ppl': '2.044', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7651328, 'tokens/trainable': 7578673, 'epoch': '1.048'}
 16%|███████████████████████████████▌                                                                                                                                                                | 934/5680 [2:40:54<10:23:07,  7.88s/it] 16%|███████████████████████████████▌                                                                                                                                                                | 935/5680 [2:41:01<10:21:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6568', 'grad_norm': '0.2323', 'learning_rate': '0.000187', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7659520, 'tokens/trainable': 7586789, 'epoch': '1.049'}
 16%|███████████████████████████████▌                                                                                                                                                                | 935/5680 [2:41:01<10:21:46,  7.86s/it] 16%|███████████████████████████████▋                                                                                                                                                                | 936/5680 [2:41:09<10:22:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7216', 'grad_norm': '0.2292', 'learning_rate': '0.0001869', 'ppl': '2.058', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 7667712, 'tokens/trainable': 7594884, 'epoch': '1.049'}
 16%|███████████████████████████████▋                                                                                                                                                                | 936/5680 [2:41:09<10:22:20,  7.87s/it] 16%|███████████████████████████████▋                                                                                                                                                                | 937/5680 [2:41:17<10:23:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4617', 'grad_norm': '0.1818', 'learning_rate': '0.0001869', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7675904, 'tokens/trainable': 7603061, 'epoch': '1.049'}
 16%|███████████████████████████████▋                                                                                                                                                                | 937/5680 [2:41:17<10:23:44,  7.89s/it] 17%|███████████████████████████████▋                                                                                                                                                                | 938/5680 [2:41:25<10:23:23,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8618', 'grad_norm': '0.252', 'learning_rate': '0.0001869', 'ppl': '2.367', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 7684096, 'tokens/trainable': 7611225, 'epoch': '1.049'}
 17%|███████████████████████████████▋                                                                                                                                                                | 938/5680 [2:41:25<10:23:23,  7.89s/it] 17%|███████████████████████████████▋                                                                                                                                                                | 939/5680 [2:41:33<10:23:32,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7809', 'grad_norm': '0.2443', 'learning_rate': '0.0001868', 'ppl': '2.183', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7692288, 'tokens/trainable': 7619372, 'epoch': '1.049'}
 17%|███████████████████████████████▋                                                                                                                                                                | 939/5680 [2:41:33<10:23:32,  7.89s/it] 17%|███████████████████████████████▊                                                                                                                                                                | 940/5680 [2:41:41<10:23:26,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5894', 'grad_norm': '0.2195', 'learning_rate': '0.0001868', 'ppl': '1.803', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7700480, 'tokens/trainable': 7627505, 'epoch': '1.049'}
 17%|███████████████████████████████▊                                                                                                                                                                | 940/5680 [2:41:41<10:23:26,  7.89s/it] 17%|███████████████████████████████▊                                                                                                                                                                | 941/5680 [2:41:49<10:24:01,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.9366', 'grad_norm': '0.2642', 'learning_rate': '0.0001868', 'ppl': '2.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7708672, 'tokens/trainable': 7635676, 'epoch': '1.05'}
 17%|███████████████████████████████▊                                                                                                                                                                | 941/5680 [2:41:49<10:24:01,  7.90s/it] 17%|███████████████████████████████▊                                                                                                                                                                | 942/5680 [2:41:57<10:24:15,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6327', 'grad_norm': '0.2107', 'learning_rate': '0.0001868', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 7716864, 'tokens/trainable': 7643800, 'epoch': '1.05'}
 17%|███████████████████████████████▊                                                                                                                                                                | 942/5680 [2:41:57<10:24:15,  7.91s/it] 17%|███████████████████████████████▉                                                                                                                                                                | 943/5680 [2:42:05<10:24:28,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8028', 'grad_norm': '0.2346', 'learning_rate': '0.0001867', 'ppl': '2.232', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7725056, 'tokens/trainable': 7651961, 'epoch': '1.05'}
 17%|███████████████████████████████▉                                                                                                                                                                | 943/5680 [2:42:05<10:24:28,  7.91s/it] 17%|███████████████████████████████▉                                                                                                                                                                | 944/5680 [2:42:13<10:25:15,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6744', 'grad_norm': '0.226', 'learning_rate': '0.0001867', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7733248, 'tokens/trainable': 7660127, 'epoch': '1.05'}
 17%|███████████████████████████████▉                                                                                                                                                                | 944/5680 [2:42:13<10:25:15,  7.92s/it] 17%|███████████████████████████████▉                                                                                                                                                                | 945/5680 [2:42:20<10:24:10,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6565', 'grad_norm': '0.248', 'learning_rate': '0.0001867', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7741440, 'tokens/trainable': 7668295, 'epoch': '1.05'}
 17%|███████████████████████████████▉                                                                                                                                                                | 945/5680 [2:42:20<10:24:10,  7.91s/it] 17%|███████████████████████████████▉                                                                                                                                                                | 946/5680 [2:42:28<10:23:51,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6524', 'grad_norm': '0.2265', 'learning_rate': '0.0001866', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7749632, 'tokens/trainable': 7676419, 'epoch': '1.051'}
 17%|███████████████████████████████▉                                                                                                                                                                | 946/5680 [2:42:28<10:23:51,  7.91s/it] 17%|████████████████████████████████                                                                                                                                                                | 947/5680 [2:42:36<10:23:45,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7743', 'grad_norm': '0.2369', 'learning_rate': '0.0001866', 'ppl': '2.169', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7757824, 'tokens/trainable': 7684597, 'epoch': '1.051'}
 17%|████████████████████████████████                                                                                                                                                                | 947/5680 [2:42:36<10:23:45,  7.91s/it] 17%|████████████████████████████████                                                                                                                                                                | 948/5680 [2:42:44<10:23:50,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9184', 'grad_norm': '0.2533', 'learning_rate': '0.0001866', 'ppl': '2.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 7766016, 'tokens/trainable': 7692734, 'epoch': '1.051'}
 17%|████████████████████████████████                                                                                                                                                                | 948/5680 [2:42:44<10:23:50,  7.91s/it] 17%|████████████████████████████████                                                                                                                                                                | 949/5680 [2:42:52<10:23:54,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9653', 'grad_norm': '0.2448', 'learning_rate': '0.0001866', 'ppl': '2.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 7774208, 'tokens/trainable': 7700847, 'epoch': '1.051'}
 17%|████████████████████████████████                                                                                                                                                                | 949/5680 [2:42:52<10:23:54,  7.91s/it] 17%|████████████████████████████████                                                                                                                                                                | 950/5680 [2:43:00<10:22:10,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8195', 'grad_norm': '0.2515', 'learning_rate': '0.0001865', 'ppl': '2.269', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7782400, 'tokens/trainable': 7708981, 'epoch': '1.051'}
 17%|████████████████████████████████                                                                                                                                                                | 950/5680 [2:43:00<10:22:10,  7.89s/it] 17%|████████████████████████████████▏                                                                                                                                                               | 951/5680 [2:43:08<10:21:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8636', 'grad_norm': '0.2997', 'learning_rate': '0.0001865', 'ppl': '2.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7790592, 'tokens/trainable': 7717083, 'epoch': '1.051'}
 17%|████████████████████████████████▏                                                                                                                                                               | 951/5680 [2:43:08<10:21:19,  7.88s/it] 17%|████████████████████████████████▏                                                                                                                                                               | 952/5680 [2:43:16<10:20:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6248', 'grad_norm': '0.2317', 'learning_rate': '0.0001865', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7798784, 'tokens/trainable': 7725229, 'epoch': '1.052'}
 17%|████████████████████████████████▏                                                                                                                                                               | 952/5680 [2:43:16<10:20:41,  7.88s/it] 17%|████████████████████████████████▏                                                                                                                                                               | 953/5680 [2:43:24<10:27:51,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7292', 'grad_norm': '0.2199', 'learning_rate': '0.0001865', 'ppl': '2.074', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 7806976, 'tokens/trainable': 7733404, 'epoch': '1.052'}
 17%|████████████████████████████████▏                                                                                                                                                               | 953/5680 [2:43:24<10:27:51,  7.97s/it] 17%|████████████████████████████████▏                                                                                                                                                               | 954/5680 [2:43:32<10:25:27,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.641', 'grad_norm': '0.2212', 'learning_rate': '0.0001864', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7815168, 'tokens/trainable': 7741534, 'epoch': '1.052'}
 17%|████████████████████████████████▏                                                                                                                                                               | 954/5680 [2:43:32<10:25:27,  7.94s/it] 17%|████████████████████████████████▎                                                                                                                                                               | 955/5680 [2:43:40<10:24:03,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8095', 'grad_norm': '0.249', 'learning_rate': '0.0001864', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7823360, 'tokens/trainable': 7749654, 'epoch': '1.052'}
 17%|████████████████████████████████▎                                                                                                                                                               | 955/5680 [2:43:40<10:24:03,  7.92s/it] 17%|████████████████████████████████▎                                                                                                                                                               | 956/5680 [2:43:48<10:22:18,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6461', 'grad_norm': '0.2422', 'learning_rate': '0.0001864', 'ppl': '1.908', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 7831552, 'tokens/trainable': 7757789, 'epoch': '1.052'}
 17%|████████████████████████████████▎                                                                                                                                                               | 956/5680 [2:43:48<10:22:18,  7.90s/it] 17%|████████████████████████████████▎                                                                                                                                                               | 957/5680 [2:43:55<10:20:33,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7924', 'grad_norm': '0.2812', 'learning_rate': '0.0001863', 'ppl': '2.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 7839744, 'tokens/trainable': 7765971, 'epoch': '1.052'}
 17%|████████████████████████████████▎                                                                                                                                                               | 957/5680 [2:43:55<10:20:33,  7.88s/it] 17%|████████████████████████████████▍                                                                                                                                                               | 958/5680 [2:44:03<10:21:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5809', 'grad_norm': '0.2438', 'learning_rate': '0.0001863', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7847936, 'tokens/trainable': 7774133, 'epoch': '1.053'}
 17%|████████████████████████████████▍                                                                                                                                                               | 958/5680 [2:44:03<10:21:07,  7.89s/it] 17%|████████████████████████████████▍                                                                                                                                                               | 959/5680 [2:44:11<10:20:31,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8096', 'grad_norm': '0.2493', 'learning_rate': '0.0001863', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 7856128, 'tokens/trainable': 7782322, 'epoch': '1.053'}
 17%|████████████████████████████████▍                                                                                                                                                               | 959/5680 [2:44:11<10:20:31,  7.89s/it] 17%|████████████████████████████████▍                                                                                                                                                               | 960/5680 [2:44:19<10:19:42,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7651', 'grad_norm': '0.2438', 'learning_rate': '0.0001863', 'ppl': '2.149', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 7864320, 'tokens/trainable': 7790484, 'epoch': '1.053'}
 17%|████████████████████████████████▍                                                                                                                                                               | 960/5680 [2:44:19<10:19:42,  7.88s/it] 17%|████████████████████████████████▍                                                                                                                                                               | 961/5680 [2:44:27<10:19:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.506', 'grad_norm': '0.2065', 'learning_rate': '0.0001862', 'ppl': '1.659', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7872512, 'tokens/trainable': 7798648, 'epoch': '1.053'}
 17%|████████████████████████████████▍                                                                                                                                                               | 961/5680 [2:44:27<10:19:58,  7.88s/it] 17%|████████████████████████████████▌                                                                                                                                                               | 962/5680 [2:44:35<10:20:06,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8009', 'grad_norm': '0.251', 'learning_rate': '0.0001862', 'ppl': '2.228', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 7880704, 'tokens/trainable': 7806809, 'epoch': '1.053'}
 17%|████████████████████████████████▌                                                                                                                                                               | 962/5680 [2:44:35<10:20:06,  7.89s/it] 17%|████████████████████████████████▌                                                                                                                                                               | 963/5680 [2:44:43<10:21:34,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.9924', 'grad_norm': '0.256', 'learning_rate': '0.0001862', 'ppl': '2.698', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 7888896, 'tokens/trainable': 7814916, 'epoch': '1.054'}
 17%|████████████████████████████████▌                                                                                                                                                               | 963/5680 [2:44:43<10:21:34,  7.91s/it] 17%|████████████████████████████████▌                                                                                                                                                               | 964/5680 [2:44:51<10:20:00,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.012', 'grad_norm': '0.2904', 'learning_rate': '0.0001861', 'ppl': '2.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 7897088, 'tokens/trainable': 7823076, 'epoch': '1.054'}
 17%|████████████████████████████████▌                                                                                                                                                               | 964/5680 [2:44:51<10:20:00,  7.89s/it] 17%|████████████████████████████████▌                                                                                                                                                               | 965/5680 [2:44:58<10:18:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7432', 'grad_norm': '0.2365', 'learning_rate': '0.0001861', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 7905280, 'tokens/trainable': 7831203, 'epoch': '1.054'}
 17%|████████████████████████████████▌                                                                                                                                                               | 965/5680 [2:44:58<10:18:01,  7.86s/it] 17%|████████████████████████████████▋                                                                                                                                                               | 966/5680 [2:45:06<10:17:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7372', 'grad_norm': '0.2452', 'learning_rate': '0.0001861', 'ppl': '2.09', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 7913472, 'tokens/trainable': 7839360, 'epoch': '1.054'}
 17%|████████████████████████████████▋                                                                                                                                                               | 966/5680 [2:45:06<10:17:21,  7.86s/it] 17%|████████████████████████████████▋                                                                                                                                                               | 967/5680 [2:45:14<10:17:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6572', 'grad_norm': '0.2356', 'learning_rate': '0.0001861', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 7921664, 'tokens/trainable': 7847486, 'epoch': '1.054'}
 17%|████████████████████████████████▋                                                                                                                                                               | 967/5680 [2:45:14<10:17:28,  7.86s/it] 17%|████████████████████████████████▋                                                                                                                                                               | 968/5680 [2:45:22<10:17:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7447', 'grad_norm': '0.2423', 'learning_rate': '0.000186', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 7929856, 'tokens/trainable': 7855643, 'epoch': '1.054'}
 17%|████████████████████████████████▋                                                                                                                                                               | 968/5680 [2:45:22<10:17:30,  7.86s/it] 17%|████████████████████████████████▊                                                                                                                                                               | 969/5680 [2:45:30<10:17:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8436', 'grad_norm': '0.2462', 'learning_rate': '0.000186', 'ppl': '2.325', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7938048, 'tokens/trainable': 7863750, 'epoch': '1.055'}
 17%|████████████████████████████████▊                                                                                                                                                               | 969/5680 [2:45:30<10:17:23,  7.86s/it] 17%|████████████████████████████████▊                                                                                                                                                               | 970/5680 [2:45:38<10:17:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6502', 'grad_norm': '0.2348', 'learning_rate': '0.000186', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 7946240, 'tokens/trainable': 7871927, 'epoch': '1.055'}
 17%|████████████████████████████████▊                                                                                                                                                               | 970/5680 [2:45:38<10:17:57,  7.87s/it] 17%|████████████████████████████████▊                                                                                                                                                               | 971/5680 [2:45:46<10:19:13,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5487', 'grad_norm': '0.2214', 'learning_rate': '0.000186', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 7954432, 'tokens/trainable': 7880077, 'epoch': '1.055'}
 17%|████████████████████████████████▊                                                                                                                                                               | 971/5680 [2:45:46<10:19:13,  7.89s/it] 17%|████████████████████████████████▊                                                                                                                                                               | 972/5680 [2:45:54<10:18:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7717', 'grad_norm': '0.2706', 'learning_rate': '0.0001859', 'ppl': '2.163', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 7962624, 'tokens/trainable': 7888223, 'epoch': '1.055'}
 17%|████████████████████████████████▊                                                                                                                                                               | 972/5680 [2:45:54<10:18:40,  7.88s/it] 17%|████████████████████████████████▉                                                                                                                                                               | 973/5680 [2:46:01<10:18:09,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7601', 'grad_norm': '0.2326', 'learning_rate': '0.0001859', 'ppl': '2.139', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 7970816, 'tokens/trainable': 7896367, 'epoch': '1.055'}
 17%|████████████████████████████████▉                                                                                                                                                               | 973/5680 [2:46:01<10:18:09,  7.88s/it] 17%|████████████████████████████████▉                                                                                                                                                               | 974/5680 [2:46:09<10:17:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7574', 'grad_norm': '0.2583', 'learning_rate': '0.0001859', 'ppl': '2.133', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 7979008, 'tokens/trainable': 7904555, 'epoch': '1.055'}
 17%|████████████████████████████████▉                                                                                                                                                               | 974/5680 [2:46:09<10:17:30,  7.87s/it] 17%|████████████████████████████████▉                                                                                                                                                               | 975/5680 [2:46:17<10:17:55,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.635', 'grad_norm': '0.3576', 'learning_rate': '0.0001858', 'ppl': '1.887', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 7987200, 'tokens/trainable': 7912693, 'epoch': '1.056'}
 17%|████████████████████████████████▉                                                                                                                                                               | 975/5680 [2:46:17<10:17:55,  7.88s/it] 17%|████████████████████████████████▉                                                                                                                                                               | 976/5680 [2:46:25<10:17:38,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6084', 'grad_norm': '0.229', 'learning_rate': '0.0001858', 'ppl': '1.838', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 7995392, 'tokens/trainable': 7920819, 'epoch': '1.056'}
 17%|████████████████████████████████▉                                                                                                                                                               | 976/5680 [2:46:25<10:17:38,  7.88s/it] 17%|█████████████████████████████████                                                                                                                                                               | 977/5680 [2:46:33<10:15:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7266', 'grad_norm': '0.2505', 'learning_rate': '0.0001858', 'ppl': '2.068', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 8003584, 'tokens/trainable': 7928978, 'epoch': '1.056'}
 17%|█████████████████████████████████                                                                                                                                                               | 977/5680 [2:46:33<10:15:59,  7.86s/it] 17%|█████████████████████████████████                                                                                                                                                               | 978/5680 [2:46:41<10:15:14,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4741', 'grad_norm': '0.2144', 'learning_rate': '0.0001858', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 8011776, 'tokens/trainable': 7937119, 'epoch': '1.056'}
 17%|█████████████████████████████████                                                                                                                                                               | 978/5680 [2:46:41<10:15:14,  7.85s/it] 17%|█████████████████████████████████                                                                                                                                                               | 979/5680 [2:46:49<10:15:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8536', 'grad_norm': '0.263', 'learning_rate': '0.0001857', 'ppl': '2.348', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8019968, 'tokens/trainable': 7945298, 'epoch': '1.056'}
 17%|█████████████████████████████████                                                                                                                                                               | 979/5680 [2:46:49<10:15:04,  7.85s/it] 17%|█████████████████████████████████▏                                                                                                                                                              | 980/5680 [2:46:56<10:15:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7744', 'grad_norm': '0.2407', 'learning_rate': '0.0001857', 'ppl': '2.169', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 8028160, 'tokens/trainable': 7953403, 'epoch': '1.057'}
 17%|█████████████████████████████████▏                                                                                                                                                              | 980/5680 [2:46:56<10:15:30,  7.86s/it] 17%|█████████████████████████████████▏                                                                                                                                                              | 981/5680 [2:47:04<10:15:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6605', 'grad_norm': '0.2105', 'learning_rate': '0.0001857', 'ppl': '1.936', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 8036352, 'tokens/trainable': 7961557, 'epoch': '1.057'}
 17%|█████████████████████████████████▏                                                                                                                                                              | 981/5680 [2:47:04<10:15:33,  7.86s/it] 17%|█████████████████████████████████▏                                                                                                                                                              | 982/5680 [2:47:12<10:15:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.62', 'grad_norm': '0.2171', 'learning_rate': '0.0001856', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8044544, 'tokens/trainable': 7969723, 'epoch': '1.057'}
 17%|█████████████████████████████████▏                                                                                                                                                              | 982/5680 [2:47:12<10:15:35,  7.86s/it] 17%|█████████████████████████████████▏                                                                                                                                                              | 983/5680 [2:47:20<10:16:54,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4756', 'grad_norm': '0.205', 'learning_rate': '0.0001856', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 8052736, 'tokens/trainable': 7977877, 'epoch': '1.057'}
 17%|█████████████████████████████████▏                                                                                                                                                              | 983/5680 [2:47:20<10:16:54,  7.88s/it] 17%|█████████████████████████████████▎                                                                                                                                                              | 984/5680 [2:47:28<10:17:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6324', 'grad_norm': '0.2245', 'learning_rate': '0.0001856', 'ppl': '1.882', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8060928, 'tokens/trainable': 7986011, 'epoch': '1.057'}
 17%|█████████████████████████████████▎                                                                                                                                                              | 984/5680 [2:47:28<10:17:05,  7.88s/it] 17%|█████████████████████████████████▎                                                                                                                                                              | 985/5680 [2:47:36<10:14:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6649', 'grad_norm': '0.2376', 'learning_rate': '0.0001856', 'ppl': '1.944', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8069120, 'tokens/trainable': 7994084, 'epoch': '1.057'}
 17%|█████████████████████████████████▎                                                                                                                                                              | 985/5680 [2:47:36<10:14:59,  7.86s/it] 17%|█████████████████████████████████▎                                                                                                                                                              | 986/5680 [2:47:44<10:15:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6733', 'grad_norm': '0.2449', 'learning_rate': '0.0001855', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8077312, 'tokens/trainable': 8002263, 'epoch': '1.058'}
 17%|█████████████████████████████████▎                                                                                                                                                              | 986/5680 [2:47:44<10:15:43,  7.87s/it] 17%|█████████████████████████████████▎                                                                                                                                                              | 987/5680 [2:47:51<10:14:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6565', 'grad_norm': '0.2255', 'learning_rate': '0.0001855', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8085504, 'tokens/trainable': 8010430, 'epoch': '1.058'}
 17%|█████████████████████████████████▎                                                                                                                                                              | 987/5680 [2:47:51<10:14:52,  7.86s/it] 17%|█████████████████████████████████▍                                                                                                                                                              | 988/5680 [2:47:59<10:15:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3968', 'grad_norm': '0.1823', 'learning_rate': '0.0001855', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8093696, 'tokens/trainable': 8018595, 'epoch': '1.058'}
 17%|█████████████████████████████████▍                                                                                                                                                              | 988/5680 [2:47:59<10:15:31,  7.87s/it] 17%|█████████████████████████████████▍                                                                                                                                                              | 989/5680 [2:48:07<10:16:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6912', 'grad_norm': '0.2532', 'learning_rate': '0.0001854', 'ppl': '1.996', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 8101888, 'tokens/trainable': 8026699, 'epoch': '1.058'}
 17%|█████████████████████████████████▍                                                                                                                                                              | 989/5680 [2:48:07<10:16:18,  7.88s/it] 17%|█████████████████████████████████▍                                                                                                                                                              | 990/5680 [2:48:15<10:17:03,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5975', 'grad_norm': '0.2682', 'learning_rate': '0.0001854', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8110080, 'tokens/trainable': 8034838, 'epoch': '1.058'}
 17%|█████████████████████████████████▍                                                                                                                                                              | 990/5680 [2:48:15<10:17:03,  7.89s/it] 17%|█████████████████████████████████▍                                                                                                                                                              | 991/5680 [2:48:23<10:16:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6552', 'grad_norm': '0.2508', 'learning_rate': '0.0001854', 'ppl': '1.926', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 8118272, 'tokens/trainable': 8042975, 'epoch': '1.058'}
 17%|█████████████████████████████████▍                                                                                                                                                              | 991/5680 [2:48:23<10:16:19,  7.89s/it] 17%|█████████████████████████████████▌                                                                                                                                                              | 992/5680 [2:48:31<10:16:04,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6519', 'grad_norm': '0.2653', 'learning_rate': '0.0001854', 'ppl': '1.919', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8126464, 'tokens/trainable': 8051095, 'epoch': '1.059'}
 17%|█████████████████████████████████▌                                                                                                                                                              | 992/5680 [2:48:31<10:16:04,  7.88s/it] 17%|█████████████████████████████████▌                                                                                                                                                              | 993/5680 [2:48:39<10:16:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6638', 'grad_norm': '0.24', 'learning_rate': '0.0001853', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8134656, 'tokens/trainable': 8059254, 'epoch': '1.059'}
 17%|█████████████████████████████████▌                                                                                                                                                              | 993/5680 [2:48:39<10:16:16,  7.89s/it] 18%|█████████████████████████████████▌                                                                                                                                                              | 994/5680 [2:48:47<10:14:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8415', 'grad_norm': '0.2742', 'learning_rate': '0.0001853', 'ppl': '2.32', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 8142848, 'tokens/trainable': 8067423, 'epoch': '1.059'}
 18%|█████████████████████████████████▌                                                                                                                                                              | 994/5680 [2:48:47<10:14:56,  7.87s/it] 18%|█████████████████████████████████▋                                                                                                                                                              | 995/5680 [2:48:55<10:14:55,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6939', 'grad_norm': '0.2242', 'learning_rate': '0.0001853', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 8151040, 'tokens/trainable': 8075532, 'epoch': '1.059'}
 18%|█████████████████████████████████▋                                                                                                                                                              | 995/5680 [2:48:55<10:14:55,  7.88s/it] 18%|█████████████████████████████████▋                                                                                                                                                              | 996/5680 [2:49:02<10:14:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6528', 'grad_norm': '0.2312', 'learning_rate': '0.0001852', 'ppl': '1.921', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8159232, 'tokens/trainable': 8083672, 'epoch': '1.059'}
 18%|█████████████████████████████████▋                                                                                                                                                              | 996/5680 [2:49:02<10:14:36,  7.87s/it] 18%|█████████████████████████████████▋                                                                                                                                                              | 997/5680 [2:49:10<10:14:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6676', 'grad_norm': '0.2617', 'learning_rate': '0.0001852', 'ppl': '1.949', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8167424, 'tokens/trainable': 8091815, 'epoch': '1.06'}
 18%|█████████████████████████████████▋                                                                                                                                                              | 997/5680 [2:49:10<10:14:13,  7.87s/it] 18%|█████████████████████████████████▋                                                                                                                                                              | 998/5680 [2:49:18<10:13:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7091', 'grad_norm': '0.2433', 'learning_rate': '0.0001852', 'ppl': '2.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8175616, 'tokens/trainable': 8099934, 'epoch': '1.06'}
 18%|█████████████████████████████████▋                                                                                                                                                              | 998/5680 [2:49:18<10:13:50,  7.87s/it] 18%|█████████████████████████████████▊                                                                                                                                                              | 999/5680 [2:49:26<10:13:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.822', 'grad_norm': '0.2772', 'learning_rate': '0.0001851', 'ppl': '2.275', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 8183808, 'tokens/trainable': 8108060, 'epoch': '1.06'}
 18%|█████████████████████████████████▊                                                                                                                                                              | 999/5680 [2:49:26<10:13:06,  7.86s/it] 18%|█████████████████████████████████▋                                                                                                                                                             | 1000/5680 [2:49:34<10:13:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.617', 'grad_norm': '0.2351', 'learning_rate': '0.0001851', 'ppl': '1.853', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 8192000, 'tokens/trainable': 8116127, 'epoch': '1.06'}
 18%|█████████████████████████████████▋                                                                                                                                                             | 1000/5680 [2:49:34<10:13:22,  7.86s/it][2026-01-27 00:38:47,927] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 00:38:47,928] [WARNING] [py.warnings._showwarnmsg:109] [PID:58142] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 00:39:31,937] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-1000
[2026-01-27 00:40:25,213] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 00:40:25,213] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

[2026-01-27 00:40:25,214] [WARNING] [py.warnings._showwarnmsg:109] [PID:58142] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 00:40:25,214] [WARNING] [py.warnings._showwarnmsg:109] [PID:58142] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

 18%|█████████████████████████████████▋                                                                                                                                                             | 1001/5680 [2:51:19<48:16:53, 37.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6832', 'grad_norm': '0.2579', 'learning_rate': '0.0001851', 'ppl': '1.98', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8200192, 'tokens/trainable': 8124307, 'epoch': '1.06'}
 18%|█████████████████████████████████▋                                                                                                                                                             | 1001/5680 [2:51:19<48:16:53, 37.15s/it] 18%|█████████████████████████████████▋                                                                                                                                                             | 1002/5680 [2:51:27<36:50:01, 28.35s/it]                                                                                                                                                                                                                                             {'loss': '0.7158', 'grad_norm': '0.2297', 'learning_rate': '0.0001851', 'ppl': '2.046', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 8208384, 'tokens/trainable': 8132476, 'epoch': '1.06'}
 18%|█████████████████████████████████▋                                                                                                                                                             | 1002/5680 [2:51:27<36:50:01, 28.35s/it] 18%|█████████████████████████████████▋                                                                                                                                                             | 1003/5680 [2:51:35<28:50:32, 22.20s/it]                                                                                                                                                                                                                                             {'loss': '0.732', 'grad_norm': '0.2305', 'learning_rate': '0.000185', 'ppl': '2.079', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8216576, 'tokens/trainable': 8140575, 'epoch': '1.061'}
 18%|█████████████████████████████████▋                                                                                                                                                             | 1003/5680 [2:51:35<28:50:32, 22.20s/it] 18%|█████████████████████████████████▊                                                                                                                                                             | 1004/5680 [2:51:43<23:15:52, 17.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6278', 'grad_norm': '0.2238', 'learning_rate': '0.000185', 'ppl': '1.873', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8224768, 'tokens/trainable': 8148720, 'epoch': '1.061'}
 18%|█████████████████████████████████▊                                                                                                                                                             | 1004/5680 [2:51:43<23:15:52, 17.91s/it] 18%|█████████████████████████████████▊                                                                                                                                                             | 1005/5680 [2:51:51<19:20:54, 14.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6552', 'grad_norm': '0.2273', 'learning_rate': '0.000185', 'ppl': '1.925', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8232960, 'tokens/trainable': 8156870, 'epoch': '1.061'}
 18%|█████████████████████████████████▊                                                                                                                                                             | 1005/5680 [2:51:51<19:20:54, 14.90s/it] 18%|█████████████████████████████████▊                                                                                                                                                             | 1006/5680 [2:51:59<16:36:23, 12.79s/it]                                                                                                                                                                                                                                             {'loss': '0.792', 'grad_norm': '0.2395', 'learning_rate': '0.0001849', 'ppl': '2.208', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8241152, 'tokens/trainable': 8165035, 'epoch': '1.061'}
 18%|█████████████████████████████████▊                                                                                                                                                             | 1006/5680 [2:51:59<16:36:23, 12.79s/it] 18%|█████████████████████████████████▊                                                                                                                                                             | 1007/5680 [2:52:07<14:42:15, 11.33s/it]                                                                                                                                                                                                                                             {'loss': '0.7405', 'grad_norm': '0.2406', 'learning_rate': '0.0001849', 'ppl': '2.097', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 8249344, 'tokens/trainable': 8173179, 'epoch': '1.061'}
 18%|█████████████████████████████████▊                                                                                                                                                             | 1007/5680 [2:52:07<14:42:15, 11.33s/it] 18%|█████████████████████████████████▉                                                                                                                                                             | 1008/5680 [2:52:14<13:21:08, 10.29s/it]                                                                                                                                                                                                                                             {'loss': '0.974', 'grad_norm': '0.2878', 'learning_rate': '0.0001849', 'ppl': '2.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8257536, 'tokens/trainable': 8181349, 'epoch': '1.061'}
 18%|█████████████████████████████████▉                                                                                                                                                             | 1008/5680 [2:52:14<13:21:08, 10.29s/it] 18%|█████████████████████████████████▉                                                                                                                                                             | 1009/5680 [2:52:22<12:23:57,  9.56s/it]                                                                                                                                                                                                                                             {'loss': '0.7002', 'grad_norm': '0.2418', 'learning_rate': '0.0001849', 'ppl': '2.014', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 8265728, 'tokens/trainable': 8189463, 'epoch': '1.062'}
 18%|█████████████████████████████████▉                                                                                                                                                             | 1009/5680 [2:52:22<12:23:57,  9.56s/it] 18%|█████████████████████████████████▉                                                                                                                                                             | 1010/5680 [2:52:30<11:43:42,  9.04s/it]                                                                                                                                                                                                                                             {'loss': '0.56', 'grad_norm': '0.2308', 'learning_rate': '0.0001848', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8273920, 'tokens/trainable': 8197584, 'epoch': '1.062'}
 18%|█████████████████████████████████▉                                                                                                                                                             | 1010/5680 [2:52:30<11:43:42,  9.04s/it] 18%|█████████████████████████████████▉                                                                                                                                                             | 1011/5680 [2:52:38<11:16:04,  8.69s/it]                                                                                                                                                                                                                                             {'loss': '0.8133', 'grad_norm': '0.2446', 'learning_rate': '0.0001848', 'ppl': '2.255', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8282112, 'tokens/trainable': 8205742, 'epoch': '1.062'}
 18%|█████████████████████████████████▉                                                                                                                                                             | 1011/5680 [2:52:38<11:16:04,  8.69s/it] 18%|██████████████████████████████████                                                                                                                                                             | 1012/5680 [2:52:46<10:55:57,  8.43s/it]                                                                                                                                                                                                                                             {'loss': '0.5632', 'grad_norm': '0.2263', 'learning_rate': '0.0001848', 'ppl': '1.756', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8290304, 'tokens/trainable': 8213816, 'epoch': '1.062'}
 18%|██████████████████████████████████                                                                                                                                                             | 1012/5680 [2:52:46<10:55:57,  8.43s/it] 18%|██████████████████████████████████                                                                                                                                                             | 1013/5680 [2:52:54<10:42:30,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '0.9128', 'grad_norm': '0.2844', 'learning_rate': '0.0001847', 'ppl': '2.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8298496, 'tokens/trainable': 8221985, 'epoch': '1.062'}
 18%|██████████████████████████████████                                                                                                                                                             | 1013/5680 [2:52:54<10:42:30,  8.26s/it] 18%|██████████████████████████████████                                                                                                                                                             | 1014/5680 [2:53:02<10:32:45,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.7075', 'grad_norm': '0.2195', 'learning_rate': '0.0001847', 'ppl': '2.029', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8306688, 'tokens/trainable': 8230108, 'epoch': '1.062'}
 18%|██████████████████████████████████                                                                                                                                                             | 1014/5680 [2:53:02<10:32:45,  8.14s/it] 18%|██████████████████████████████████▏                                                                                                                                                            | 1015/5680 [2:53:09<10:26:16,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.9053', 'grad_norm': '0.2677', 'learning_rate': '0.0001847', 'ppl': '2.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 8314880, 'tokens/trainable': 8238290, 'epoch': '1.063'}
 18%|██████████████████████████████████▏                                                                                                                                                            | 1015/5680 [2:53:09<10:26:16,  8.06s/it] 18%|██████████████████████████████████▏                                                                                                                                                            | 1016/5680 [2:53:17<10:21:32,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6036', 'grad_norm': '0.251', 'learning_rate': '0.0001847', 'ppl': '1.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 8323072, 'tokens/trainable': 8246466, 'epoch': '1.063'}
 18%|██████████████████████████████████▏                                                                                                                                                            | 1016/5680 [2:53:17<10:21:32,  8.00s/it] 18%|██████████████████████████████████▏                                                                                                                                                            | 1017/5680 [2:53:25<10:18:14,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.693', 'grad_norm': '0.2242', 'learning_rate': '0.0001846', 'ppl': '2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8331264, 'tokens/trainable': 8254600, 'epoch': '1.063'}
 18%|██████████████████████████████████▏                                                                                                                                                            | 1017/5680 [2:53:25<10:18:14,  7.96s/it] 18%|██████████████████████████████████▏                                                                                                                                                            | 1018/5680 [2:53:33<10:15:20,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8559', 'grad_norm': '0.2316', 'learning_rate': '0.0001846', 'ppl': '2.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 8339456, 'tokens/trainable': 8262749, 'epoch': '1.063'}
 18%|██████████████████████████████████▏                                                                                                                                                            | 1018/5680 [2:53:33<10:15:20,  7.92s/it] 18%|██████████████████████████████████▎                                                                                                                                                            | 1019/5680 [2:53:41<10:15:31,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.726', 'grad_norm': '0.2638', 'learning_rate': '0.0001846', 'ppl': '2.067', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8347648, 'tokens/trainable': 8270927, 'epoch': '1.063'}
 18%|██████████████████████████████████▎                                                                                                                                                            | 1019/5680 [2:53:41<10:15:31,  7.92s/it] 18%|██████████████████████████████████▎                                                                                                                                                            | 1020/5680 [2:53:49<10:15:38,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '1.036', 'grad_norm': '0.2644', 'learning_rate': '0.0001845', 'ppl': '2.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8355840, 'tokens/trainable': 8279078, 'epoch': '1.064'}
 18%|██████████████████████████████████▎                                                                                                                                                            | 1020/5680 [2:53:49<10:15:38,  7.93s/it] 18%|██████████████████████████████████▎                                                                                                                                                            | 1021/5680 [2:53:57<10:15:31,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.9076', 'grad_norm': '0.2408', 'learning_rate': '0.0001845', 'ppl': '2.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 8364032, 'tokens/trainable': 8287241, 'epoch': '1.064'}
 18%|██████████████████████████████████▎                                                                                                                                                            | 1021/5680 [2:53:57<10:15:31,  7.93s/it] 18%|██████████████████████████████████▎                                                                                                                                                            | 1022/5680 [2:54:05<10:16:23,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.001', 'grad_norm': '0.2744', 'learning_rate': '0.0001845', 'ppl': '2.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 8372224, 'tokens/trainable': 8295370, 'epoch': '1.064'}
 18%|██████████████████████████████████▎                                                                                                                                                            | 1022/5680 [2:54:05<10:16:23,  7.94s/it] 18%|██████████████████████████████████▍                                                                                                                                                            | 1023/5680 [2:54:13<10:16:44,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7633', 'grad_norm': '0.2309', 'learning_rate': '0.0001844', 'ppl': '2.145', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 8380416, 'tokens/trainable': 8303536, 'epoch': '1.064'}
 18%|██████████████████████████████████▍                                                                                                                                                            | 1023/5680 [2:54:13<10:16:44,  7.95s/it] 18%|██████████████████████████████████▍                                                                                                                                                            | 1024/5680 [2:54:21<10:16:20,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.727', 'grad_norm': '0.2462', 'learning_rate': '0.0001844', 'ppl': '2.069', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 8388608, 'tokens/trainable': 8311634, 'epoch': '1.064'}
 18%|██████████████████████████████████▍                                                                                                                                                            | 1024/5680 [2:54:21<10:16:20,  7.94s/it] 18%|██████████████████████████████████▍                                                                                                                                                            | 1025/5680 [2:54:29<10:16:38,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7309', 'grad_norm': '0.2223', 'learning_rate': '0.0001844', 'ppl': '2.077', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 8396800, 'tokens/trainable': 8319680, 'epoch': '1.064'}
 18%|██████████████████████████████████▍                                                                                                                                                            | 1025/5680 [2:54:29<10:16:38,  7.95s/it] 18%|██████████████████████████████████▌                                                                                                                                                            | 1026/5680 [2:54:36<10:15:58,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5672', 'grad_norm': '0.2111', 'learning_rate': '0.0001844', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 8404992, 'tokens/trainable': 8327833, 'epoch': '1.065'}
 18%|██████████████████████████████████▌                                                                                                                                                            | 1026/5680 [2:54:36<10:15:58,  7.94s/it] 18%|██████████████████████████████████▌                                                                                                                                                            | 1027/5680 [2:54:44<10:15:45,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.9942', 'grad_norm': '0.3224', 'learning_rate': '0.0001843', 'ppl': '2.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 8413184, 'tokens/trainable': 8336020, 'epoch': '1.065'}
 18%|██████████████████████████████████▌                                                                                                                                                            | 1027/5680 [2:54:44<10:15:45,  7.94s/it] 18%|██████████████████████████████████▌                                                                                                                                                            | 1028/5680 [2:54:52<10:15:31,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7246', 'grad_norm': '0.2428', 'learning_rate': '0.0001843', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 8421376, 'tokens/trainable': 8344193, 'epoch': '1.065'}
 18%|██████████████████████████████████▌                                                                                                                                                            | 1028/5680 [2:54:52<10:15:31,  7.94s/it] 18%|██████████████████████████████████▌                                                                                                                                                            | 1029/5680 [2:55:00<10:13:53,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8997', 'grad_norm': '0.2673', 'learning_rate': '0.0001843', 'ppl': '2.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8429568, 'tokens/trainable': 8352371, 'epoch': '1.065'}
 18%|██████████████████████████████████▌                                                                                                                                                            | 1029/5680 [2:55:00<10:13:53,  7.92s/it] 18%|██████████████████████████████████▋                                                                                                                                                            | 1030/5680 [2:55:08<10:11:57,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7083', 'grad_norm': '0.2465', 'learning_rate': '0.0001842', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8437760, 'tokens/trainable': 8360520, 'epoch': '1.065'}
 18%|██████████████████████████████████▋                                                                                                                                                            | 1030/5680 [2:55:08<10:11:57,  7.90s/it] 18%|██████████████████████████████████▋                                                                                                                                                            | 1031/5680 [2:55:16<10:11:55,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3917', 'grad_norm': '0.1858', 'learning_rate': '0.0001842', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8445952, 'tokens/trainable': 8368699, 'epoch': '1.065'}
 18%|██████████████████████████████████▋                                                                                                                                                            | 1031/5680 [2:55:16<10:11:55,  7.90s/it] 18%|██████████████████████████████████▋                                                                                                                                                            | 1032/5680 [2:55:24<10:12:20,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5554', 'grad_norm': '0.2389', 'learning_rate': '0.0001842', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8454144, 'tokens/trainable': 8376839, 'epoch': '1.066'}
 18%|██████████████████████████████████▋                                                                                                                                                            | 1032/5680 [2:55:24<10:12:20,  7.90s/it] 18%|██████████████████████████████████▋                                                                                                                                                            | 1033/5680 [2:55:32<10:11:30,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6652', 'grad_norm': '0.2274', 'learning_rate': '0.0001841', 'ppl': '1.945', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 8462336, 'tokens/trainable': 8385029, 'epoch': '1.066'}
 18%|██████████████████████████████████▋                                                                                                                                                            | 1033/5680 [2:55:32<10:11:30,  7.90s/it] 18%|██████████████████████████████████▊                                                                                                                                                            | 1034/5680 [2:55:40<10:11:27,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.692', 'grad_norm': '0.2284', 'learning_rate': '0.0001841', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8470528, 'tokens/trainable': 8393203, 'epoch': '1.066'}
 18%|██████████████████████████████████▊                                                                                                                                                            | 1034/5680 [2:55:40<10:11:27,  7.90s/it] 18%|██████████████████████████████████▊                                                                                                                                                            | 1035/5680 [2:55:48<10:11:42,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5905', 'grad_norm': '0.2176', 'learning_rate': '0.0001841', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8478720, 'tokens/trainable': 8401338, 'epoch': '1.066'}
 18%|██████████████████████████████████▊                                                                                                                                                            | 1035/5680 [2:55:48<10:11:42,  7.90s/it] 18%|██████████████████████████████████▊                                                                                                                                                            | 1036/5680 [2:55:55<10:11:12,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6949', 'grad_norm': '0.239', 'learning_rate': '0.0001841', 'ppl': '2.004', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8486912, 'tokens/trainable': 8409518, 'epoch': '1.066'}
 18%|██████████████████████████████████▊                                                                                                                                                            | 1036/5680 [2:55:55<10:11:12,  7.90s/it] 18%|██████████████████████████████████▊                                                                                                                                                            | 1037/5680 [2:56:03<10:11:16,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.684', 'grad_norm': '0.2567', 'learning_rate': '0.000184', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8495104, 'tokens/trainable': 8417702, 'epoch': '1.067'}
 18%|██████████████████████████████████▊                                                                                                                                                            | 1037/5680 [2:56:03<10:11:16,  7.90s/it] 18%|██████████████████████████████████▉                                                                                                                                                            | 1038/5680 [2:56:11<10:10:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8181', 'grad_norm': '0.2416', 'learning_rate': '0.000184', 'ppl': '2.266', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8503296, 'tokens/trainable': 8425847, 'epoch': '1.067'}
 18%|██████████████████████████████████▉                                                                                                                                                            | 1038/5680 [2:56:11<10:10:16,  7.89s/it] 18%|██████████████████████████████████▉                                                                                                                                                            | 1039/5680 [2:56:19<10:09:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6237', 'grad_norm': '0.2153', 'learning_rate': '0.000184', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8511488, 'tokens/trainable': 8434025, 'epoch': '1.067'}
 18%|██████████████████████████████████▉                                                                                                                                                            | 1039/5680 [2:56:19<10:09:49,  7.88s/it] 18%|██████████████████████████████████▉                                                                                                                                                            | 1040/5680 [2:56:27<10:09:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.799', 'grad_norm': '0.2712', 'learning_rate': '0.0001839', 'ppl': '2.223', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 8519680, 'tokens/trainable': 8442071, 'epoch': '1.067'}
 18%|██████████████████████████████████▉                                                                                                                                                            | 1040/5680 [2:56:27<10:09:10,  7.88s/it] 18%|███████████████████████████████████                                                                                                                                                            | 1041/5680 [2:56:35<10:09:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5477', 'grad_norm': '0.2124', 'learning_rate': '0.0001839', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8527872, 'tokens/trainable': 8450214, 'epoch': '1.067'}
 18%|███████████████████████████████████                                                                                                                                                            | 1041/5680 [2:56:35<10:09:14,  7.88s/it] 18%|███████████████████████████████████                                                                                                                                                            | 1042/5680 [2:56:43<10:08:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7337', 'grad_norm': '0.24', 'learning_rate': '0.0001839', 'ppl': '2.083', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 8536064, 'tokens/trainable': 8458385, 'epoch': '1.067'}
 18%|███████████████████████████████████                                                                                                                                                            | 1042/5680 [2:56:43<10:08:08,  7.87s/it] 18%|███████████████████████████████████                                                                                                                                                            | 1043/5680 [2:56:51<10:07:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5881', 'grad_norm': '0.2325', 'learning_rate': '0.0001838', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8544256, 'tokens/trainable': 8466495, 'epoch': '1.068'}
 18%|███████████████████████████████████                                                                                                                                                            | 1043/5680 [2:56:51<10:07:43,  7.86s/it] 18%|███████████████████████████████████                                                                                                                                                            | 1044/5680 [2:56:58<10:07:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.78', 'grad_norm': '0.2586', 'learning_rate': '0.0001838', 'ppl': '2.181', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8552448, 'tokens/trainable': 8474632, 'epoch': '1.068'}
 18%|███████████████████████████████████                                                                                                                                                            | 1044/5680 [2:56:58<10:07:07,  7.86s/it] 18%|███████████████████████████████████▏                                                                                                                                                           | 1045/5680 [2:57:06<10:08:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6563', 'grad_norm': '0.2835', 'learning_rate': '0.0001838', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 8560640, 'tokens/trainable': 8482809, 'epoch': '1.068'}
 18%|███████████████████████████████████▏                                                                                                                                                           | 1045/5680 [2:57:06<10:08:11,  7.87s/it] 18%|███████████████████████████████████▏                                                                                                                                                           | 1046/5680 [2:57:14<10:08:59,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4587', 'grad_norm': '0.2165', 'learning_rate': '0.0001838', 'ppl': '1.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 8568832, 'tokens/trainable': 8490934, 'epoch': '1.068'}
 18%|███████████████████████████████████▏                                                                                                                                                           | 1046/5680 [2:57:14<10:08:59,  7.89s/it] 18%|███████████████████████████████████▏                                                                                                                                                           | 1047/5680 [2:57:22<10:08:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6939', 'grad_norm': '0.2246', 'learning_rate': '0.0001837', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 8577024, 'tokens/trainable': 8499071, 'epoch': '1.068'}
 18%|███████████████████████████████████▏                                                                                                                                                           | 1047/5680 [2:57:22<10:08:54,  7.89s/it] 18%|███████████████████████████████████▏                                                                                                                                                           | 1048/5680 [2:57:30<10:08:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5531', 'grad_norm': '0.2194', 'learning_rate': '0.0001837', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 8585216, 'tokens/trainable': 8507202, 'epoch': '1.068'}
 18%|███████████████████████████████████▏                                                                                                                                                           | 1048/5680 [2:57:30<10:08:16,  7.88s/it] 18%|███████████████████████████████████▎                                                                                                                                                           | 1049/5680 [2:57:38<10:07:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5751', 'grad_norm': '0.2605', 'learning_rate': '0.0001837', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8593408, 'tokens/trainable': 8515345, 'epoch': '1.069'}
 18%|███████████████████████████████████▎                                                                                                                                                           | 1049/5680 [2:57:38<10:07:53,  7.88s/it] 18%|███████████████████████████████████▎                                                                                                                                                           | 1050/5680 [2:57:46<10:07:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7828', 'grad_norm': '0.2449', 'learning_rate': '0.0001836', 'ppl': '2.188', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8601600, 'tokens/trainable': 8523503, 'epoch': '1.069'}
 18%|███████████████████████████████████▎                                                                                                                                                           | 1050/5680 [2:57:46<10:07:19,  7.87s/it] 19%|███████████████████████████████████▎                                                                                                                                                           | 1051/5680 [2:57:54<10:08:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.471', 'grad_norm': '0.1988', 'learning_rate': '0.0001836', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 8609792, 'tokens/trainable': 8531653, 'epoch': '1.069'}
 19%|███████████████████████████████████▎                                                                                                                                                           | 1051/5680 [2:57:54<10:08:54,  7.89s/it] 19%|███████████████████████████████████▍                                                                                                                                                           | 1052/5680 [2:58:02<10:07:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5825', 'grad_norm': '0.2147', 'learning_rate': '0.0001836', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8617984, 'tokens/trainable': 8539781, 'epoch': '1.069'}
 19%|███████████████████████████████████▍                                                                                                                                                           | 1052/5680 [2:58:02<10:07:57,  7.88s/it] 19%|███████████████████████████████████▍                                                                                                                                                           | 1053/5680 [2:58:09<10:07:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8844', 'grad_norm': '0.2548', 'learning_rate': '0.0001835', 'ppl': '2.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 8626176, 'tokens/trainable': 8547930, 'epoch': '1.069'}
 19%|███████████████████████████████████▍                                                                                                                                                           | 1053/5680 [2:58:09<10:07:23,  7.88s/it] 19%|███████████████████████████████████▍                                                                                                                                                           | 1054/5680 [2:58:17<10:06:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5932', 'grad_norm': '0.21', 'learning_rate': '0.0001835', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8634368, 'tokens/trainable': 8556038, 'epoch': '1.07'}
 19%|███████████████████████████████████▍                                                                                                                                                           | 1054/5680 [2:58:17<10:06:43,  7.87s/it] 19%|███████████████████████████████████▍                                                                                                                                                           | 1055/5680 [2:58:25<10:06:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7565', 'grad_norm': '0.2453', 'learning_rate': '0.0001835', 'ppl': '2.131', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 8642560, 'tokens/trainable': 8564201, 'epoch': '1.07'}
 19%|███████████████████████████████████▍                                                                                                                                                           | 1055/5680 [2:58:25<10:06:20,  7.87s/it] 19%|███████████████████████████████████▌                                                                                                                                                           | 1056/5680 [2:58:33<10:06:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7473', 'grad_norm': '0.2731', 'learning_rate': '0.0001835', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8650752, 'tokens/trainable': 8572367, 'epoch': '1.07'}
 19%|███████████████████████████████████▌                                                                                                                                                           | 1056/5680 [2:58:33<10:06:12,  7.87s/it] 19%|███████████████████████████████████▌                                                                                                                                                           | 1057/5680 [2:58:41<10:13:14,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5422', 'grad_norm': '0.2068', 'learning_rate': '0.0001834', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.2', 'tokens/total': 8658944, 'tokens/trainable': 8580502, 'epoch': '1.07'}
 19%|███████████████████████████████████▌                                                                                                                                                           | 1057/5680 [2:58:41<10:13:14,  7.96s/it] 19%|███████████████████████████████████▌                                                                                                                                                           | 1058/5680 [2:58:49<10:11:18,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7502', 'grad_norm': '0.2576', 'learning_rate': '0.0001834', 'ppl': '2.117', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 8667136, 'tokens/trainable': 8588573, 'epoch': '1.07'}
 19%|███████████████████████████████████▌                                                                                                                                                           | 1058/5680 [2:58:49<10:11:18,  7.94s/it] 19%|███████████████████████████████████▌                                                                                                                                                           | 1059/5680 [2:58:57<10:10:02,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5955', 'grad_norm': '0.2719', 'learning_rate': '0.0001834', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8675328, 'tokens/trainable': 8596722, 'epoch': '1.07'}
 19%|███████████████████████████████████▌                                                                                                                                                           | 1059/5680 [2:58:57<10:10:02,  7.92s/it] 19%|███████████████████████████████████▋                                                                                                                                                           | 1060/5680 [2:59:05<10:08:29,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4301', 'grad_norm': '0.1879', 'learning_rate': '0.0001833', 'ppl': '1.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 8683520, 'tokens/trainable': 8604829, 'epoch': '1.071'}
 19%|███████████████████████████████████▋                                                                                                                                                           | 1060/5680 [2:59:05<10:08:29,  7.90s/it] 19%|███████████████████████████████████▋                                                                                                                                                           | 1061/5680 [2:59:13<10:08:11,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6286', 'grad_norm': '0.2598', 'learning_rate': '0.0001833', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 8691712, 'tokens/trainable': 8612953, 'epoch': '1.071'}
 19%|███████████████████████████████████▋                                                                                                                                                           | 1061/5680 [2:59:13<10:08:11,  7.90s/it] 19%|███████████████████████████████████▋                                                                                                                                                           | 1062/5680 [2:59:21<10:07:36,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6244', 'grad_norm': '0.2561', 'learning_rate': '0.0001833', 'ppl': '1.867', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8699904, 'tokens/trainable': 8621134, 'epoch': '1.071'}
 19%|███████████████████████████████████▋                                                                                                                                                           | 1062/5680 [2:59:21<10:07:36,  7.89s/it] 19%|███████████████████████████████████▋                                                                                                                                                           | 1063/5680 [2:59:28<10:06:36,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8168', 'grad_norm': '0.2603', 'learning_rate': '0.0001832', 'ppl': '2.263', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8708096, 'tokens/trainable': 8629263, 'epoch': '1.071'}
 19%|███████████████████████████████████▋                                                                                                                                                           | 1063/5680 [2:59:28<10:06:36,  7.88s/it] 19%|███████████████████████████████████▊                                                                                                                                                           | 1064/5680 [2:59:36<10:05:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.761', 'grad_norm': '0.2506', 'learning_rate': '0.0001832', 'ppl': '2.14', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8716288, 'tokens/trainable': 8637409, 'epoch': '1.071'}
 19%|███████████████████████████████████▊                                                                                                                                                           | 1064/5680 [2:59:36<10:05:05,  7.87s/it] 19%|███████████████████████████████████▊                                                                                                                                                           | 1065/5680 [2:59:44<10:04:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.2357', 'learning_rate': '0.0001832', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 8724480, 'tokens/trainable': 8645475, 'epoch': '1.071'}
 19%|███████████████████████████████████▊                                                                                                                                                           | 1065/5680 [2:59:44<10:04:45,  7.86s/it] 19%|███████████████████████████████████▊                                                                                                                                                           | 1066/5680 [2:59:52<10:05:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5845', 'grad_norm': '0.2341', 'learning_rate': '0.0001831', 'ppl': '1.794', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8732672, 'tokens/trainable': 8653612, 'epoch': '1.072'}
 19%|███████████████████████████████████▊                                                                                                                                                           | 1066/5680 [2:59:52<10:05:19,  7.87s/it] 19%|███████████████████████████████████▉                                                                                                                                                           | 1067/5680 [3:00:00<10:04:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7729', 'grad_norm': '0.2825', 'learning_rate': '0.0001831', 'ppl': '2.166', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 8740864, 'tokens/trainable': 8661783, 'epoch': '1.072'}
 19%|███████████████████████████████████▉                                                                                                                                                           | 1067/5680 [3:00:00<10:04:49,  7.87s/it] 19%|███████████████████████████████████▉                                                                                                                                                           | 1068/5680 [3:00:08<10:04:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5575', 'grad_norm': '0.2133', 'learning_rate': '0.0001831', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 8749056, 'tokens/trainable': 8669876, 'epoch': '1.072'}
 19%|███████████████████████████████████▉                                                                                                                                                           | 1068/5680 [3:00:08<10:04:53,  7.87s/it] 19%|███████████████████████████████████▉                                                                                                                                                           | 1069/5680 [3:00:16<10:04:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7001', 'grad_norm': '0.2581', 'learning_rate': '0.0001831', 'ppl': '2.014', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 8757248, 'tokens/trainable': 8677994, 'epoch': '1.072'}
 19%|███████████████████████████████████▉                                                                                                                                                           | 1069/5680 [3:00:16<10:04:49,  7.87s/it] 19%|███████████████████████████████████▉                                                                                                                                                           | 1070/5680 [3:00:24<10:11:08,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7021', 'grad_norm': '0.2295', 'learning_rate': '0.000183', 'ppl': '2.018', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 8765440, 'tokens/trainable': 8686158, 'epoch': '1.072'}
 19%|███████████████████████████████████▉                                                                                                                                                           | 1070/5680 [3:00:24<10:11:08,  7.95s/it] 19%|████████████████████████████████████                                                                                                                                                           | 1071/5680 [3:00:32<10:08:27,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8943', 'grad_norm': '0.2701', 'learning_rate': '0.000183', 'ppl': '2.446', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 8773632, 'tokens/trainable': 8694343, 'epoch': '1.073'}
 19%|████████████████████████████████████                                                                                                                                                           | 1071/5680 [3:00:32<10:08:27,  7.92s/it] 19%|████████████████████████████████████                                                                                                                                                           | 1072/5680 [3:00:39<10:07:02,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8644', 'grad_norm': '0.2602', 'learning_rate': '0.000183', 'ppl': '2.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8781824, 'tokens/trainable': 8702462, 'epoch': '1.073'}
 19%|████████████████████████████████████                                                                                                                                                           | 1072/5680 [3:00:39<10:07:02,  7.90s/it] 19%|████████████████████████████████████                                                                                                                                                           | 1073/5680 [3:00:47<10:05:51,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8859', 'grad_norm': '0.2899', 'learning_rate': '0.0001829', 'ppl': '2.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8790016, 'tokens/trainable': 8710648, 'epoch': '1.073'}
 19%|████████████████████████████████████                                                                                                                                                           | 1073/5680 [3:00:47<10:05:51,  7.89s/it] 19%|████████████████████████████████████                                                                                                                                                           | 1074/5680 [3:00:55<10:04:44,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.2337', 'learning_rate': '0.0001829', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8798208, 'tokens/trainable': 8718814, 'epoch': '1.073'}
 19%|████████████████████████████████████                                                                                                                                                           | 1074/5680 [3:00:55<10:04:44,  7.88s/it] 19%|████████████████████████████████████▏                                                                                                                                                          | 1075/5680 [3:01:03<10:03:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6639', 'grad_norm': '0.2622', 'learning_rate': '0.0001829', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8806400, 'tokens/trainable': 8726922, 'epoch': '1.073'}
 19%|████████████████████████████████████▏                                                                                                                                                          | 1075/5680 [3:01:03<10:03:37,  7.86s/it] 19%|████████████████████████████████████▏                                                                                                                                                          | 1076/5680 [3:01:11<10:02:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8336', 'grad_norm': '0.252', 'learning_rate': '0.0001828', 'ppl': '2.302', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 8814592, 'tokens/trainable': 8735080, 'epoch': '1.073'}
 19%|████████████████████████████████████▏                                                                                                                                                          | 1076/5680 [3:01:11<10:02:20,  7.85s/it] 19%|████████████████████████████████████▏                                                                                                                                                          | 1077/5680 [3:01:19<10:02:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.795', 'grad_norm': '0.2624', 'learning_rate': '0.0001828', 'ppl': '2.214', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 8822784, 'tokens/trainable': 8743187, 'epoch': '1.074'}
 19%|████████████████████████████████████▏                                                                                                                                                          | 1077/5680 [3:01:19<10:02:20,  7.85s/it] 19%|████████████████████████████████████▏                                                                                                                                                          | 1078/5680 [3:01:26<10:02:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5546', 'grad_norm': '0.2069', 'learning_rate': '0.0001828', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8830976, 'tokens/trainable': 8751365, 'epoch': '1.074'}
 19%|████████████████████████████████████▏                                                                                                                                                          | 1078/5680 [3:01:26<10:02:09,  7.85s/it] 19%|████████████████████████████████████▎                                                                                                                                                          | 1079/5680 [3:01:34<10:02:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8167', 'grad_norm': '0.2648', 'learning_rate': '0.0001827', 'ppl': '2.263', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8839168, 'tokens/trainable': 8759483, 'epoch': '1.074'}
 19%|████████████████████████████████████▎                                                                                                                                                          | 1079/5680 [3:01:34<10:02:20,  7.85s/it] 19%|████████████████████████████████████▎                                                                                                                                                          | 1080/5680 [3:01:42<10:01:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4995', 'grad_norm': '0.1914', 'learning_rate': '0.0001827', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8847360, 'tokens/trainable': 8767605, 'epoch': '1.074'}
 19%|████████████████████████████████████▎                                                                                                                                                          | 1080/5680 [3:01:42<10:01:50,  7.85s/it] 19%|████████████████████████████████████▎                                                                                                                                                          | 1081/5680 [3:01:50<10:01:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8252', 'grad_norm': '0.2264', 'learning_rate': '0.0001827', 'ppl': '2.282', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 8855552, 'tokens/trainable': 8775779, 'epoch': '1.074'}
 19%|████████████████████████████████████▎                                                                                                                                                          | 1081/5680 [3:01:50<10:01:46,  7.85s/it] 19%|████████████████████████████████████▍                                                                                                                                                          | 1082/5680 [3:01:58<10:01:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7769', 'grad_norm': '0.2437', 'learning_rate': '0.0001827', 'ppl': '2.175', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8863744, 'tokens/trainable': 8783959, 'epoch': '1.074'}
 19%|████████████████████████████████████▍                                                                                                                                                          | 1082/5680 [3:01:58<10:01:39,  7.85s/it] 19%|████████████████████████████████████▍                                                                                                                                                          | 1083/5680 [3:02:06<10:01:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5056', 'grad_norm': '0.2336', 'learning_rate': '0.0001826', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 8871936, 'tokens/trainable': 8792087, 'epoch': '1.075'}
 19%|████████████████████████████████████▍                                                                                                                                                          | 1083/5680 [3:02:06<10:01:22,  7.85s/it] 19%|████████████████████████████████████▍                                                                                                                                                          | 1084/5680 [3:02:14<10:00:51,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.658', 'grad_norm': '0.2499', 'learning_rate': '0.0001826', 'ppl': '1.931', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 8880128, 'tokens/trainable': 8800246, 'epoch': '1.075'}
 19%|████████████████████████████████████▍                                                                                                                                                          | 1084/5680 [3:02:14<10:00:51,  7.84s/it] 19%|████████████████████████████████████▍                                                                                                                                                          | 1085/5680 [3:02:21<10:02:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.596', 'grad_norm': '0.215', 'learning_rate': '0.0001826', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8888320, 'tokens/trainable': 8808418, 'epoch': '1.075'}
 19%|████████████████████████████████████▍                                                                                                                                                          | 1085/5680 [3:02:21<10:02:02,  7.86s/it] 19%|████████████████████████████████████▌                                                                                                                                                          | 1086/5680 [3:02:29<10:01:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6341', 'grad_norm': '0.212', 'learning_rate': '0.0001825', 'ppl': '1.885', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 8896512, 'tokens/trainable': 8816583, 'epoch': '1.075'}
 19%|████████████████████████████████████▌                                                                                                                                                          | 1086/5680 [3:02:29<10:01:10,  7.85s/it] 19%|████████████████████████████████████▌                                                                                                                                                          | 1087/5680 [3:02:37<10:02:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7918', 'grad_norm': '0.2386', 'learning_rate': '0.0001825', 'ppl': '2.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 8904704, 'tokens/trainable': 8824714, 'epoch': '1.075'}
 19%|████████████████████████████████████▌                                                                                                                                                          | 1087/5680 [3:02:37<10:02:59,  7.88s/it] 19%|████████████████████████████████████▌                                                                                                                                                          | 1088/5680 [3:02:45<10:04:18,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3987', 'grad_norm': '0.1806', 'learning_rate': '0.0001825', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 8912896, 'tokens/trainable': 8832811, 'epoch': '1.076'}
 19%|████████████████████████████████████▌                                                                                                                                                          | 1088/5680 [3:02:45<10:04:18,  7.90s/it] 19%|████████████████████████████████████▌                                                                                                                                                          | 1089/5680 [3:02:53<10:04:27,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7623', 'grad_norm': '0.2338', 'learning_rate': '0.0001824', 'ppl': '2.143', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 8921088, 'tokens/trainable': 8840932, 'epoch': '1.076'}
 19%|████████████████████████████████████▌                                                                                                                                                          | 1089/5680 [3:02:53<10:04:27,  7.90s/it] 19%|████████████████████████████████████▋                                                                                                                                                          | 1090/5680 [3:03:01<10:04:20,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5334', 'grad_norm': '0.2259', 'learning_rate': '0.0001824', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 8929280, 'tokens/trainable': 8848996, 'epoch': '1.076'}
 19%|████████████████████████████████████▋                                                                                                                                                          | 1090/5680 [3:03:01<10:04:20,  7.90s/it] 19%|████████████████████████████████████▋                                                                                                                                                          | 1091/5680 [3:03:09<10:02:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7165', 'grad_norm': '0.2469', 'learning_rate': '0.0001824', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 8937472, 'tokens/trainable': 8857175, 'epoch': '1.076'}
 19%|████████████████████████████████████▋                                                                                                                                                          | 1091/5680 [3:03:09<10:02:59,  7.88s/it] 19%|████████████████████████████████████▋                                                                                                                                                          | 1092/5680 [3:03:17<10:02:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7082', 'grad_norm': '0.2251', 'learning_rate': '0.0001823', 'ppl': '2.03', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 8945664, 'tokens/trainable': 8865284, 'epoch': '1.076'}
 19%|████████████████████████████████████▋                                                                                                                                                          | 1092/5680 [3:03:17<10:02:15,  7.88s/it] 19%|████████████████████████████████████▊                                                                                                                                                          | 1093/5680 [3:03:25<10:01:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6576', 'grad_norm': '0.2137', 'learning_rate': '0.0001823', 'ppl': '1.93', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8953856, 'tokens/trainable': 8873414, 'epoch': '1.076'}
 19%|████████████████████████████████████▊                                                                                                                                                          | 1093/5680 [3:03:25<10:01:36,  7.87s/it] 19%|████████████████████████████████████▊                                                                                                                                                          | 1094/5680 [3:03:32<10:01:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7059', 'grad_norm': '0.2588', 'learning_rate': '0.0001823', 'ppl': '2.026', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 8962048, 'tokens/trainable': 8881546, 'epoch': '1.077'}
 19%|████████████████████████████████████▊                                                                                                                                                          | 1094/5680 [3:03:32<10:01:14,  7.87s/it] 19%|████████████████████████████████████▊                                                                                                                                                          | 1095/5680 [3:03:40<10:01:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8083', 'grad_norm': '0.2315', 'learning_rate': '0.0001822', 'ppl': '2.244', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 8970240, 'tokens/trainable': 8889713, 'epoch': '1.077'}
 19%|████████████████████████████████████▊                                                                                                                                                          | 1095/5680 [3:03:40<10:01:15,  7.87s/it] 19%|████████████████████████████████████▊                                                                                                                                                          | 1096/5680 [3:03:48<10:01:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4869', 'grad_norm': '0.2162', 'learning_rate': '0.0001822', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 8978432, 'tokens/trainable': 8897853, 'epoch': '1.077'}
 19%|████████████████████████████████████▊                                                                                                                                                          | 1096/5680 [3:03:48<10:01:18,  7.87s/it] 19%|████████████████████████████████████▉                                                                                                                                                          | 1097/5680 [3:03:56<10:02:42,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8829', 'grad_norm': '0.2528', 'learning_rate': '0.0001822', 'ppl': '2.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 8986624, 'tokens/trainable': 8905979, 'epoch': '1.077'}
 19%|████████████████████████████████████▉                                                                                                                                                          | 1097/5680 [3:03:56<10:02:42,  7.89s/it] 19%|████████████████████████████████████▉                                                                                                                                                          | 1098/5680 [3:04:04<10:02:23,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.784', 'grad_norm': '0.2568', 'learning_rate': '0.0001822', 'ppl': '2.19', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 8994816, 'tokens/trainable': 8914118, 'epoch': '1.077'}
 19%|████████████████████████████████████▉                                                                                                                                                          | 1098/5680 [3:04:04<10:02:23,  7.89s/it] 19%|████████████████████████████████████▉                                                                                                                                                          | 1099/5680 [3:04:12<10:01:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8072', 'grad_norm': '0.2853', 'learning_rate': '0.0001821', 'ppl': '2.242', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 9003008, 'tokens/trainable': 8922267, 'epoch': '1.077'}
 19%|████████████████████████████████████▉                                                                                                                                                          | 1099/5680 [3:04:12<10:01:41,  7.88s/it] 19%|████████████████████████████████████▉                                                                                                                                                          | 1100/5680 [3:04:20<10:01:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7629', 'grad_norm': '0.2599', 'learning_rate': '0.0001821', 'ppl': '2.145', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 9011200, 'tokens/trainable': 8930377, 'epoch': '1.078'}
 19%|████████████████████████████████████▉                                                                                                                                                          | 1100/5680 [3:04:20<10:01:37,  7.88s/it] 19%|█████████████████████████████████████                                                                                                                                                          | 1101/5680 [3:04:28<10:00:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5537', 'grad_norm': '0.2146', 'learning_rate': '0.0001821', 'ppl': '1.74', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9019392, 'tokens/trainable': 8938448, 'epoch': '1.078'}
 19%|█████████████████████████████████████                                                                                                                                                          | 1101/5680 [3:04:28<10:00:56,  7.87s/it] 19%|█████████████████████████████████████                                                                                                                                                          | 1102/5680 [3:04:35<10:00:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6474', 'grad_norm': '0.2078', 'learning_rate': '0.000182', 'ppl': '1.911', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 9027584, 'tokens/trainable': 8946560, 'epoch': '1.078'}
 19%|█████████████████████████████████████                                                                                                                                                          | 1102/5680 [3:04:35<10:00:43,  7.87s/it] 19%|█████████████████████████████████████                                                                                                                                                          | 1103/5680 [3:04:43<10:00:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6863', 'grad_norm': '0.2242', 'learning_rate': '0.000182', 'ppl': '1.986', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 9035776, 'tokens/trainable': 8954614, 'epoch': '1.078'}
 19%|█████████████████████████████████████                                                                                                                                                          | 1103/5680 [3:04:43<10:00:24,  7.87s/it] 19%|█████████████████████████████████████▎                                                                                                                                                          | 1104/5680 [3:04:51<9:59:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7471', 'grad_norm': '0.2443', 'learning_rate': '0.000182', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 9043968, 'tokens/trainable': 8962714, 'epoch': '1.078'}
 19%|█████████████████████████████████████▎                                                                                                                                                          | 1104/5680 [3:04:51<9:59:16,  7.86s/it] 19%|█████████████████████████████████████▎                                                                                                                                                          | 1105/5680 [3:04:59<9:58:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7449', 'grad_norm': '0.2327', 'learning_rate': '0.0001819', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 9052160, 'tokens/trainable': 8970867, 'epoch': '1.079'}
 19%|█████████████████████████████████████▎                                                                                                                                                          | 1105/5680 [3:04:59<9:58:57,  7.86s/it] 19%|█████████████████████████████████████▍                                                                                                                                                          | 1106/5680 [3:05:07<9:59:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6851', 'grad_norm': '0.2639', 'learning_rate': '0.0001819', 'ppl': '1.984', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9060352, 'tokens/trainable': 8979007, 'epoch': '1.079'}
 19%|█████████████████████████████████████▍                                                                                                                                                          | 1106/5680 [3:05:07<9:59:38,  7.87s/it] 19%|█████████████████████████████████████▏                                                                                                                                                         | 1107/5680 [3:05:15<10:00:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6033', 'grad_norm': '0.2503', 'learning_rate': '0.0001819', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 9068544, 'tokens/trainable': 8987097, 'epoch': '1.079'}
 19%|█████████████████████████████████████▏                                                                                                                                                         | 1107/5680 [3:05:15<10:00:21,  7.88s/it] 20%|█████████████████████████████████████▍                                                                                                                                                          | 1108/5680 [3:05:23<9:59:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6099', 'grad_norm': '0.2406', 'learning_rate': '0.0001818', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9076736, 'tokens/trainable': 8995211, 'epoch': '1.079'}
 20%|█████████████████████████████████████▍                                                                                                                                                          | 1108/5680 [3:05:23<9:59:57,  7.87s/it] 20%|█████████████████████████████████████▎                                                                                                                                                         | 1109/5680 [3:05:31<10:00:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.689', 'grad_norm': '0.2451', 'learning_rate': '0.0001818', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9084928, 'tokens/trainable': 9003370, 'epoch': '1.079'}
 20%|█████████████████████████████████████▎                                                                                                                                                         | 1109/5680 [3:05:31<10:00:23,  7.88s/it] 20%|█████████████████████████████████████▌                                                                                                                                                          | 1110/5680 [3:05:38<9:59:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8096', 'grad_norm': '0.2434', 'learning_rate': '0.0001818', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 9093120, 'tokens/trainable': 9011537, 'epoch': '1.079'}
 20%|█████████████████████████████████████▌                                                                                                                                                          | 1110/5680 [3:05:38<9:59:14,  7.87s/it] 20%|█████████████████████████████████████▎                                                                                                                                                         | 1111/5680 [3:05:46<10:01:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6057', 'grad_norm': '0.2177', 'learning_rate': '0.0001817', 'ppl': '1.833', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 9101312, 'tokens/trainable': 9019607, 'epoch': '1.08'}
 20%|█████████████████████████████████████▎                                                                                                                                                         | 1111/5680 [3:05:46<10:01:07,  7.89s/it] 20%|█████████████████████████████████████▍                                                                                                                                                         | 1112/5680 [3:05:54<10:02:21,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7549', 'grad_norm': '0.2424', 'learning_rate': '0.0001817', 'ppl': '2.127', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 9109504, 'tokens/trainable': 9027693, 'epoch': '1.08'}
 20%|█████████████████████████████████████▍                                                                                                                                                         | 1112/5680 [3:05:54<10:02:21,  7.91s/it] 20%|█████████████████████████████████████▍                                                                                                                                                         | 1113/5680 [3:06:02<10:02:47,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8646', 'grad_norm': '0.2581', 'learning_rate': '0.0001817', 'ppl': '2.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 9117696, 'tokens/trainable': 9035801, 'epoch': '1.08'}
 20%|█████████████████████████████████████▍                                                                                                                                                         | 1113/5680 [3:06:02<10:02:47,  7.92s/it] 20%|█████████████████████████████████████▍                                                                                                                                                         | 1114/5680 [3:06:10<10:02:45,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '1.046', 'grad_norm': '0.2583', 'learning_rate': '0.0001816', 'ppl': '2.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 9125888, 'tokens/trainable': 9043930, 'epoch': '1.08'}
 20%|█████████████████████████████████████▍                                                                                                                                                         | 1114/5680 [3:06:10<10:02:45,  7.92s/it] 20%|█████████████████████████████████████▍                                                                                                                                                         | 1115/5680 [3:06:18<10:00:30,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.611', 'grad_norm': '0.2454', 'learning_rate': '0.0001816', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 9134080, 'tokens/trainable': 9051975, 'epoch': '1.08'}
 20%|█████████████████████████████████████▍                                                                                                                                                         | 1115/5680 [3:06:18<10:00:30,  7.89s/it] 20%|█████████████████████████████████████▌                                                                                                                                                         | 1116/5680 [3:06:26<10:01:39,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7991', 'grad_norm': '0.2481', 'learning_rate': '0.0001816', 'ppl': '2.224', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 9142272, 'tokens/trainable': 9060068, 'epoch': '1.08'}
 20%|█████████████████████████████████████▌                                                                                                                                                         | 1116/5680 [3:06:26<10:01:39,  7.91s/it] 20%|█████████████████████████████████████▌                                                                                                                                                         | 1117/5680 [3:06:34<10:02:03,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5573', 'grad_norm': '0.2338', 'learning_rate': '0.0001815', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 9150464, 'tokens/trainable': 9068101, 'epoch': '1.081'}
 20%|█████████████████████████████████████▌                                                                                                                                                         | 1117/5680 [3:06:34<10:02:03,  7.92s/it] 20%|█████████████████████████████████████▌                                                                                                                                                         | 1118/5680 [3:06:42<10:01:45,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7202', 'grad_norm': '0.2521', 'learning_rate': '0.0001815', 'ppl': '2.055', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 9158656, 'tokens/trainable': 9076252, 'epoch': '1.081'}
 20%|█████████████████████████████████████▌                                                                                                                                                         | 1118/5680 [3:06:42<10:01:45,  7.91s/it] 20%|█████████████████████████████████████▋                                                                                                                                                         | 1119/5680 [3:06:50<10:01:01,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3945', 'grad_norm': '0.2046', 'learning_rate': '0.0001815', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 9166848, 'tokens/trainable': 9084364, 'epoch': '1.081'}
 20%|█████████████████████████████████████▋                                                                                                                                                         | 1119/5680 [3:06:50<10:01:01,  7.91s/it] 20%|█████████████████████████████████████▋                                                                                                                                                         | 1120/5680 [3:06:58<10:01:26,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.561', 'grad_norm': '0.2561', 'learning_rate': '0.0001815', 'ppl': '1.752', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 9175040, 'tokens/trainable': 9092532, 'epoch': '1.081'}
 20%|█████████████████████████████████████▋                                                                                                                                                         | 1120/5680 [3:06:58<10:01:26,  7.91s/it] 20%|█████████████████████████████████████▋                                                                                                                                                         | 1121/5680 [3:07:05<10:01:00,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.836', 'grad_norm': '0.2928', 'learning_rate': '0.0001814', 'ppl': '2.307', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9183232, 'tokens/trainable': 9100646, 'epoch': '1.081'}
 20%|█████████████████████████████████████▋                                                                                                                                                         | 1121/5680 [3:07:05<10:01:00,  7.91s/it] 20%|█████████████████████████████████████▋                                                                                                                                                         | 1122/5680 [3:07:13<10:00:01,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6019', 'grad_norm': '0.2161', 'learning_rate': '0.0001814', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 9191424, 'tokens/trainable': 9108662, 'epoch': '1.082'}
 20%|█████████████████████████████████████▋                                                                                                                                                         | 1122/5680 [3:07:13<10:00:01,  7.90s/it] 20%|█████████████████████████████████████▉                                                                                                                                                          | 1123/5680 [3:07:21<9:59:22,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.679', 'grad_norm': '0.2457', 'learning_rate': '0.0001814', 'ppl': '1.972', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9199616, 'tokens/trainable': 9116745, 'epoch': '1.082'}
 20%|█████████████████████████████████████▉                                                                                                                                                          | 1123/5680 [3:07:21<9:59:22,  7.89s/it] 20%|█████████████████████████████████████▊                                                                                                                                                         | 1124/5680 [3:07:29<10:00:36,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6113', 'grad_norm': '0.2896', 'learning_rate': '0.0001813', 'ppl': '1.843', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 9207808, 'tokens/trainable': 9124901, 'epoch': '1.082'}
 20%|█████████████████████████████████████▊                                                                                                                                                         | 1124/5680 [3:07:29<10:00:36,  7.91s/it] 20%|█████████████████████████████████████▊                                                                                                                                                         | 1125/5680 [3:07:37<10:01:38,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6724', 'grad_norm': '0.2443', 'learning_rate': '0.0001813', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 9216000, 'tokens/trainable': 9133024, 'epoch': '1.082'}
 20%|█████████████████████████████████████▊                                                                                                                                                         | 1125/5680 [3:07:37<10:01:38,  7.92s/it] 20%|█████████████████████████████████████▊                                                                                                                                                         | 1126/5680 [3:07:45<10:01:43,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6797', 'grad_norm': '0.2273', 'learning_rate': '0.0001813', 'ppl': '1.973', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 9224192, 'tokens/trainable': 9141123, 'epoch': '1.082'}
 20%|█████████████████████████████████████▊                                                                                                                                                         | 1126/5680 [3:07:45<10:01:43,  7.93s/it] 20%|█████████████████████████████████████▉                                                                                                                                                         | 1127/5680 [3:07:53<10:01:51,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.8331', 'grad_norm': '0.2506', 'learning_rate': '0.0001812', 'ppl': '2.3', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 9232384, 'tokens/trainable': 9149307, 'epoch': '1.082'}
 20%|█████████████████████████████████████▉                                                                                                                                                         | 1127/5680 [3:07:53<10:01:51,  7.93s/it] 20%|█████████████████████████████████████▉                                                                                                                                                         | 1128/5680 [3:08:01<10:01:54,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4716', 'grad_norm': '0.1884', 'learning_rate': '0.0001812', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 9240576, 'tokens/trainable': 9157391, 'epoch': '1.083'}
 20%|█████████████████████████████████████▉                                                                                                                                                         | 1128/5680 [3:08:01<10:01:54,  7.93s/it] 20%|█████████████████████████████████████▉                                                                                                                                                         | 1129/5680 [3:08:09<10:01:45,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.9015', 'grad_norm': '0.274', 'learning_rate': '0.0001812', 'ppl': '2.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 9248768, 'tokens/trainable': 9165514, 'epoch': '1.083'}
 20%|█████████████████████████████████████▉                                                                                                                                                         | 1129/5680 [3:08:09<10:01:45,  7.93s/it] 20%|█████████████████████████████████████▉                                                                                                                                                         | 1130/5680 [3:08:17<10:01:09,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6536', 'grad_norm': '0.2147', 'learning_rate': '0.0001811', 'ppl': '1.922', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9256960, 'tokens/trainable': 9173675, 'epoch': '1.083'}
 20%|█████████████████████████████████████▉                                                                                                                                                         | 1130/5680 [3:08:17<10:01:09,  7.93s/it] 20%|██████████████████████████████████████▏                                                                                                                                                         | 1131/5680 [3:08:25<9:58:59,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8872', 'grad_norm': '0.2607', 'learning_rate': '0.0001811', 'ppl': '2.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9265152, 'tokens/trainable': 9181794, 'epoch': '1.083'}
 20%|██████████████████████████████████████▏                                                                                                                                                         | 1131/5680 [3:08:25<9:58:59,  7.90s/it] 20%|██████████████████████████████████████▎                                                                                                                                                         | 1132/5680 [3:08:32<9:57:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8646', 'grad_norm': '0.2612', 'learning_rate': '0.0001811', 'ppl': '2.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9273344, 'tokens/trainable': 9189884, 'epoch': '1.083'}
 20%|██████████████████████████████████████▎                                                                                                                                                         | 1132/5680 [3:08:32<9:57:18,  7.88s/it] 20%|██████████████████████████████████████▎                                                                                                                                                         | 1133/5680 [3:08:40<9:56:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6833', 'grad_norm': '0.2323', 'learning_rate': '0.000181', 'ppl': '1.98', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9281536, 'tokens/trainable': 9198000, 'epoch': '1.083'}
 20%|██████████████████████████████████████▎                                                                                                                                                         | 1133/5680 [3:08:40<9:56:39,  7.87s/it] 20%|██████████████████████████████████████▎                                                                                                                                                         | 1134/5680 [3:08:48<9:56:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7972', 'grad_norm': '0.2209', 'learning_rate': '0.000181', 'ppl': '2.219', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9289728, 'tokens/trainable': 9206135, 'epoch': '1.084'}
 20%|██████████████████████████████████████▎                                                                                                                                                         | 1134/5680 [3:08:48<9:56:04,  7.87s/it] 20%|██████████████████████████████████████▎                                                                                                                                                         | 1135/5680 [3:08:56<9:56:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '1.262', 'grad_norm': '0.3165', 'learning_rate': '0.000181', 'ppl': '3.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 9297920, 'tokens/trainable': 9214253, 'epoch': '1.084'}
 20%|██████████████████████████████████████▎                                                                                                                                                         | 1135/5680 [3:08:56<9:56:17,  7.87s/it] 20%|██████████████████████████████████████▍                                                                                                                                                         | 1136/5680 [3:09:04<9:55:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7745', 'grad_norm': '0.2461', 'learning_rate': '0.0001809', 'ppl': '2.169', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 9306112, 'tokens/trainable': 9222356, 'epoch': '1.084'}
 20%|██████████████████████████████████████▍                                                                                                                                                         | 1136/5680 [3:09:04<9:55:09,  7.86s/it] 20%|██████████████████████████████████████▍                                                                                                                                                         | 1137/5680 [3:09:12<9:55:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7098', 'grad_norm': '0.2278', 'learning_rate': '0.0001809', 'ppl': '2.033', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9314304, 'tokens/trainable': 9230528, 'epoch': '1.084'}
 20%|██████████████████████████████████████▍                                                                                                                                                         | 1137/5680 [3:09:12<9:55:44,  7.87s/it] 20%|██████████████████████████████████████▍                                                                                                                                                         | 1138/5680 [3:09:20<9:55:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8431', 'grad_norm': '0.2579', 'learning_rate': '0.0001809', 'ppl': '2.324', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9322496, 'tokens/trainable': 9238637, 'epoch': '1.084'}
 20%|██████████████████████████████████████▍                                                                                                                                                         | 1138/5680 [3:09:20<9:55:11,  7.86s/it] 20%|██████████████████████████████████████▌                                                                                                                                                         | 1139/5680 [3:09:28<9:55:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.562', 'grad_norm': '0.2149', 'learning_rate': '0.0001808', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 9330688, 'tokens/trainable': 9246710, 'epoch': '1.085'}
 20%|██████████████████████████████████████▌                                                                                                                                                         | 1139/5680 [3:09:28<9:55:34,  7.87s/it] 20%|██████████████████████████████████████▌                                                                                                                                                         | 1140/5680 [3:09:35<9:55:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7511', 'grad_norm': '0.2456', 'learning_rate': '0.0001808', 'ppl': '2.119', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 9338880, 'tokens/trainable': 9254854, 'epoch': '1.085'}
 20%|██████████████████████████████████████▌                                                                                                                                                         | 1140/5680 [3:09:35<9:55:07,  7.87s/it] 20%|██████████████████████████████████████▌                                                                                                                                                         | 1141/5680 [3:09:43<9:55:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7286', 'grad_norm': '0.2435', 'learning_rate': '0.0001808', 'ppl': '2.072', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 9347072, 'tokens/trainable': 9263001, 'epoch': '1.085'}
 20%|██████████████████████████████████████▌                                                                                                                                                         | 1141/5680 [3:09:43<9:55:24,  7.87s/it] 20%|██████████████████████████████████████▌                                                                                                                                                         | 1142/5680 [3:09:51<9:54:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6828', 'grad_norm': '0.2678', 'learning_rate': '0.0001807', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 9355264, 'tokens/trainable': 9271130, 'epoch': '1.085'}
 20%|██████████████████████████████████████▌                                                                                                                                                         | 1142/5680 [3:09:51<9:54:37,  7.86s/it] 20%|██████████████████████████████████████▋                                                                                                                                                         | 1143/5680 [3:09:59<9:54:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3754', 'grad_norm': '0.1795', 'learning_rate': '0.0001807', 'ppl': '1.456', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 9363456, 'tokens/trainable': 9279311, 'epoch': '1.085'}
 20%|██████████████████████████████████████▋                                                                                                                                                         | 1143/5680 [3:09:59<9:54:35,  7.86s/it] 20%|██████████████████████████████████████▋                                                                                                                                                         | 1144/5680 [3:10:07<9:55:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7725', 'grad_norm': '0.364', 'learning_rate': '0.0001807', 'ppl': '2.165', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9371648, 'tokens/trainable': 9287451, 'epoch': '1.085'}
 20%|██████████████████████████████████████▋                                                                                                                                                         | 1144/5680 [3:10:07<9:55:05,  7.87s/it] 20%|██████████████████████████████████████▋                                                                                                                                                         | 1145/5680 [3:10:15<9:56:50,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '1.198', 'grad_norm': '0.3167', 'learning_rate': '0.0001806', 'ppl': '3.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 9379840, 'tokens/trainable': 9295531, 'epoch': '1.086'}
 20%|██████████████████████████████████████▋                                                                                                                                                         | 1145/5680 [3:10:15<9:56:50,  7.90s/it] 20%|██████████████████████████████████████▋                                                                                                                                                         | 1146/5680 [3:10:23<9:57:37,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7217', 'grad_norm': '0.2344', 'learning_rate': '0.0001806', 'ppl': '2.058', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9388032, 'tokens/trainable': 9303680, 'epoch': '1.086'}
 20%|██████████████████████████████████████▋                                                                                                                                                         | 1146/5680 [3:10:23<9:57:37,  7.91s/it] 20%|██████████████████████████████████████▊                                                                                                                                                         | 1147/5680 [3:10:31<9:57:02,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6769', 'grad_norm': '0.2089', 'learning_rate': '0.0001806', 'ppl': '1.968', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9396224, 'tokens/trainable': 9311776, 'epoch': '1.086'}
 20%|██████████████████████████████████████▊                                                                                                                                                         | 1147/5680 [3:10:31<9:57:02,  7.90s/it] 20%|██████████████████████████████████████▊                                                                                                                                                         | 1148/5680 [3:10:38<9:55:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7398', 'grad_norm': '0.3155', 'learning_rate': '0.0001805', 'ppl': '2.096', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9404416, 'tokens/trainable': 9319878, 'epoch': '1.086'}
 20%|██████████████████████████████████████▊                                                                                                                                                         | 1148/5680 [3:10:38<9:55:44,  7.89s/it] 20%|██████████████████████████████████████▊                                                                                                                                                         | 1149/5680 [3:10:46<9:54:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5946', 'grad_norm': '0.2494', 'learning_rate': '0.0001805', 'ppl': '1.812', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 9412608, 'tokens/trainable': 9328019, 'epoch': '1.086'}
 20%|██████████████████████████████████████▊                                                                                                                                                         | 1149/5680 [3:10:46<9:54:31,  7.87s/it] 20%|██████████████████████████████████████▊                                                                                                                                                         | 1150/5680 [3:10:54<9:53:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '1.027', 'grad_norm': '0.302', 'learning_rate': '0.0001805', 'ppl': '2.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 9420800, 'tokens/trainable': 9336116, 'epoch': '1.086'}
 20%|██████████████████████████████████████▊                                                                                                                                                         | 1150/5680 [3:10:54<9:53:25,  7.86s/it] 20%|██████████████████████████████████████▉                                                                                                                                                         | 1151/5680 [3:11:02<9:52:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.855', 'grad_norm': '0.3042', 'learning_rate': '0.0001804', 'ppl': '2.351', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 9428992, 'tokens/trainable': 9344261, 'epoch': '1.087'}
 20%|██████████████████████████████████████▉                                                                                                                                                         | 1151/5680 [3:11:02<9:52:52,  7.85s/it] 20%|██████████████████████████████████████▉                                                                                                                                                         | 1152/5680 [3:11:10<9:53:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6055', 'grad_norm': '0.2346', 'learning_rate': '0.0001804', 'ppl': '1.832', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9437184, 'tokens/trainable': 9352445, 'epoch': '1.087'}
 20%|██████████████████████████████████████▉                                                                                                                                                         | 1152/5680 [3:11:10<9:53:51,  7.87s/it] 20%|██████████████████████████████████████▉                                                                                                                                                         | 1153/5680 [3:11:18<9:54:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7683', 'grad_norm': '0.2758', 'learning_rate': '0.0001804', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 9445376, 'tokens/trainable': 9360622, 'epoch': '1.087'}
 20%|██████████████████████████████████████▉                                                                                                                                                         | 1153/5680 [3:11:18<9:54:41,  7.88s/it] 20%|███████████████████████████████████████                                                                                                                                                         | 1154/5680 [3:11:26<9:54:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6692', 'grad_norm': '0.2664', 'learning_rate': '0.0001803', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 9453568, 'tokens/trainable': 9368612, 'epoch': '1.087'}
 20%|███████████████████████████████████████                                                                                                                                                         | 1154/5680 [3:11:26<9:54:40,  7.88s/it] 20%|███████████████████████████████████████                                                                                                                                                         | 1155/5680 [3:11:34<9:53:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4342', 'grad_norm': '0.1992', 'learning_rate': '0.0001803', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 9461760, 'tokens/trainable': 9376742, 'epoch': '1.087'}
 20%|███████████████████████████████████████                                                                                                                                                         | 1155/5680 [3:11:34<9:53:57,  7.88s/it] 20%|███████████████████████████████████████                                                                                                                                                         | 1156/5680 [3:11:41<9:53:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8146', 'grad_norm': '0.2565', 'learning_rate': '0.0001803', 'ppl': '2.258', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9469952, 'tokens/trainable': 9384896, 'epoch': '1.087'}
 20%|███████████████████████████████████████                                                                                                                                                         | 1156/5680 [3:11:41<9:53:45,  7.87s/it] 20%|███████████████████████████████████████                                                                                                                                                         | 1157/5680 [3:11:50<9:59:39,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7597', 'grad_norm': '0.2548', 'learning_rate': '0.0001802', 'ppl': '2.138', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 9478144, 'tokens/trainable': 9393028, 'epoch': '1.088'}
 20%|███████████████████████████████████████                                                                                                                                                         | 1157/5680 [3:11:50<9:59:39,  7.95s/it] 20%|███████████████████████████████████████▏                                                                                                                                                        | 1158/5680 [3:11:57<9:57:33,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5469', 'grad_norm': '0.2377', 'learning_rate': '0.0001802', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 9486336, 'tokens/trainable': 9401190, 'epoch': '1.088'}
 20%|███████████████████████████████████████▏                                                                                                                                                        | 1158/5680 [3:11:57<9:57:33,  7.93s/it] 20%|███████████████████████████████████████▏                                                                                                                                                        | 1159/5680 [3:12:05<9:55:42,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5463', 'grad_norm': '0.2111', 'learning_rate': '0.0001802', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 9494528, 'tokens/trainable': 9409356, 'epoch': '1.088'}
 20%|███████████████████████████████████████▏                                                                                                                                                        | 1159/5680 [3:12:05<9:55:42,  7.91s/it] 20%|███████████████████████████████████████▏                                                                                                                                                        | 1160/5680 [3:12:13<9:54:47,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4617', 'grad_norm': '0.1966', 'learning_rate': '0.0001801', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 9502720, 'tokens/trainable': 9417376, 'epoch': '1.088'}
 20%|███████████████████████████████████████▏                                                                                                                                                        | 1160/5680 [3:12:13<9:54:47,  7.90s/it] 20%|███████████████████████████████████████▏                                                                                                                                                        | 1161/5680 [3:12:21<9:54:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6014', 'grad_norm': '0.2169', 'learning_rate': '0.0001801', 'ppl': '1.825', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9510912, 'tokens/trainable': 9425404, 'epoch': '1.088'}
 20%|███████████████████████████████████████▏                                                                                                                                                        | 1161/5680 [3:12:21<9:54:07,  7.89s/it] 20%|███████████████████████████████████████▎                                                                                                                                                        | 1162/5680 [3:12:29<9:53:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5931', 'grad_norm': '0.2211', 'learning_rate': '0.0001801', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 9519104, 'tokens/trainable': 9433532, 'epoch': '1.089'}
 20%|███████████████████████████████████████▎                                                                                                                                                        | 1162/5680 [3:12:29<9:53:15,  7.88s/it] 20%|███████████████████████████████████████▎                                                                                                                                                        | 1163/5680 [3:12:37<9:53:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6096', 'grad_norm': '0.2238', 'learning_rate': '0.00018', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 9527296, 'tokens/trainable': 9441579, 'epoch': '1.089'}
 20%|███████████████████████████████████████▎                                                                                                                                                        | 1163/5680 [3:12:37<9:53:08,  7.88s/it] 20%|███████████████████████████████████████▎                                                                                                                                                        | 1164/5680 [3:12:45<9:52:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.742', 'grad_norm': '0.2265', 'learning_rate': '0.00018', 'ppl': '2.1', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 9535488, 'tokens/trainable': 9449758, 'epoch': '1.089'}
 20%|███████████████████████████████████████▎                                                                                                                                                        | 1164/5680 [3:12:45<9:52:20,  7.87s/it] 21%|███████████████████████████████████████▍                                                                                                                                                        | 1165/5680 [3:12:52<9:51:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4783', 'grad_norm': '0.1975', 'learning_rate': '0.00018', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 9543680, 'tokens/trainable': 9457775, 'epoch': '1.089'}
 21%|███████████████████████████████████████▍                                                                                                                                                        | 1165/5680 [3:12:52<9:51:25,  7.86s/it] 21%|███████████████████████████████████████▍                                                                                                                                                        | 1166/5680 [3:13:00<9:51:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6358', 'grad_norm': '0.236', 'learning_rate': '0.0001799', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9551872, 'tokens/trainable': 9465914, 'epoch': '1.089'}
 21%|███████████████████████████████████████▍                                                                                                                                                        | 1166/5680 [3:13:00<9:51:41,  7.86s/it] 21%|███████████████████████████████████████▍                                                                                                                                                        | 1167/5680 [3:13:08<9:50:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8812', 'grad_norm': '0.2251', 'learning_rate': '0.0001799', 'ppl': '2.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 9560064, 'tokens/trainable': 9474083, 'epoch': '1.089'}
 21%|███████████████████████████████████████▍                                                                                                                                                        | 1167/5680 [3:13:08<9:50:50,  7.86s/it] 21%|███████████████████████████████████████▍                                                                                                                                                        | 1168/5680 [3:13:16<9:51:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7333', 'grad_norm': '0.241', 'learning_rate': '0.0001799', 'ppl': '2.082', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9568256, 'tokens/trainable': 9482241, 'epoch': '1.09'}
 21%|███████████████████████████████████████▍                                                                                                                                                        | 1168/5680 [3:13:16<9:51:13,  7.86s/it] 21%|███████████████████████████████████████▌                                                                                                                                                        | 1169/5680 [3:13:24<9:52:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5091', 'grad_norm': '0.2271', 'learning_rate': '0.0001798', 'ppl': '1.664', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9576448, 'tokens/trainable': 9490379, 'epoch': '1.09'}
 21%|███████████████████████████████████████▌                                                                                                                                                        | 1169/5680 [3:13:24<9:52:35,  7.88s/it] 21%|███████████████████████████████████████▌                                                                                                                                                        | 1170/5680 [3:13:32<9:51:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8348', 'grad_norm': '0.2583', 'learning_rate': '0.0001798', 'ppl': '2.304', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 9584640, 'tokens/trainable': 9498474, 'epoch': '1.09'}
 21%|███████████████████████████████████████▌                                                                                                                                                        | 1170/5680 [3:13:32<9:51:16,  7.87s/it] 21%|███████████████████████████████████████▌                                                                                                                                                        | 1171/5680 [3:13:40<9:51:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7121', 'grad_norm': '0.2802', 'learning_rate': '0.0001798', 'ppl': '2.038', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 9592832, 'tokens/trainable': 9506517, 'epoch': '1.09'}
 21%|███████████████████████████████████████▌                                                                                                                                                        | 1171/5680 [3:13:40<9:51:47,  7.87s/it] 21%|███████████████████████████████████████▌                                                                                                                                                        | 1172/5680 [3:13:48<9:50:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8776', 'grad_norm': '0.2435', 'learning_rate': '0.0001797', 'ppl': '2.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 9601024, 'tokens/trainable': 9514690, 'epoch': '1.09'}
 21%|███████████████████████████████████████▌                                                                                                                                                        | 1172/5680 [3:13:48<9:50:24,  7.86s/it] 21%|███████████████████████████████████████▋                                                                                                                                                        | 1173/5680 [3:13:55<9:50:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6149', 'grad_norm': '0.2292', 'learning_rate': '0.0001797', 'ppl': '1.849', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 9609216, 'tokens/trainable': 9522781, 'epoch': '1.09'}
 21%|███████████████████████████████████████▋                                                                                                                                                        | 1173/5680 [3:13:55<9:50:46,  7.86s/it] 21%|███████████████████████████████████████▋                                                                                                                                                        | 1174/5680 [3:14:03<9:49:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7674', 'grad_norm': '0.256', 'learning_rate': '0.0001797', 'ppl': '2.154', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9617408, 'tokens/trainable': 9530745, 'epoch': '1.091'}
 21%|███████████████████████████████████████▋                                                                                                                                                        | 1174/5680 [3:14:03<9:49:26,  7.85s/it] 21%|███████████████████████████████████████▋                                                                                                                                                        | 1175/5680 [3:14:11<9:49:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6325', 'grad_norm': '0.246', 'learning_rate': '0.0001796', 'ppl': '1.882', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 9625600, 'tokens/trainable': 9538842, 'epoch': '1.091'}
 21%|███████████████████████████████████████▋                                                                                                                                                        | 1175/5680 [3:14:11<9:49:38,  7.85s/it] 21%|███████████████████████████████████████▊                                                                                                                                                        | 1176/5680 [3:14:19<9:51:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4242', 'grad_norm': '0.1942', 'learning_rate': '0.0001796', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 9633792, 'tokens/trainable': 9546999, 'epoch': '1.091'}
 21%|███████████████████████████████████████▊                                                                                                                                                        | 1176/5680 [3:14:19<9:51:27,  7.88s/it] 21%|███████████████████████████████████████▊                                                                                                                                                        | 1177/5680 [3:14:27<9:51:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5808', 'grad_norm': '0.2271', 'learning_rate': '0.0001796', 'ppl': '1.787', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9641984, 'tokens/trainable': 9555126, 'epoch': '1.091'}
 21%|███████████████████████████████████████▊                                                                                                                                                        | 1177/5680 [3:14:27<9:51:13,  7.88s/it] 21%|███████████████████████████████████████▊                                                                                                                                                        | 1178/5680 [3:14:35<9:50:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.556', 'grad_norm': '0.2064', 'learning_rate': '0.0001795', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 9650176, 'tokens/trainable': 9563137, 'epoch': '1.091'}
 21%|███████████████████████████████████████▊                                                                                                                                                        | 1178/5680 [3:14:35<9:50:01,  7.86s/it] 21%|███████████████████████████████████████▊                                                                                                                                                        | 1179/5680 [3:14:43<9:49:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7155', 'grad_norm': '0.2662', 'learning_rate': '0.0001795', 'ppl': '2.045', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 9658368, 'tokens/trainable': 9571171, 'epoch': '1.092'}
 21%|███████████████████████████████████████▊                                                                                                                                                        | 1179/5680 [3:14:43<9:49:53,  7.86s/it] 21%|███████████████████████████████████████▉                                                                                                                                                        | 1180/5680 [3:14:50<9:49:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7921', 'grad_norm': '0.2725', 'learning_rate': '0.0001795', 'ppl': '2.208', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 9666560, 'tokens/trainable': 9579157, 'epoch': '1.092'}
 21%|███████████████████████████████████████▉                                                                                                                                                        | 1180/5680 [3:14:50<9:49:32,  7.86s/it] 21%|███████████████████████████████████████▉                                                                                                                                                        | 1181/5680 [3:14:58<9:49:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6622', 'grad_norm': '0.241', 'learning_rate': '0.0001794', 'ppl': '1.939', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 9674752, 'tokens/trainable': 9587332, 'epoch': '1.092'}
 21%|███████████████████████████████████████▉                                                                                                                                                        | 1181/5680 [3:14:58<9:49:33,  7.86s/it] 21%|███████████████████████████████████████▉                                                                                                                                                        | 1182/5680 [3:15:06<9:48:33,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6636', 'grad_norm': '0.2272', 'learning_rate': '0.0001794', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 9682944, 'tokens/trainable': 9595318, 'epoch': '1.092'}
 21%|███████████████████████████████████████▉                                                                                                                                                        | 1182/5680 [3:15:06<9:48:33,  7.85s/it] 21%|███████████████████████████████████████▉                                                                                                                                                        | 1183/5680 [3:15:14<9:48:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.9204', 'grad_norm': '0.2785', 'learning_rate': '0.0001794', 'ppl': '2.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 9691136, 'tokens/trainable': 9603175, 'epoch': '1.092'}
 21%|███████████████████████████████████████▉                                                                                                                                                        | 1183/5680 [3:15:14<9:48:37,  7.85s/it] 21%|████████████████████████████████████████                                                                                                                                                        | 1184/5680 [3:15:22<9:48:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8727', 'grad_norm': '0.2535', 'learning_rate': '0.0001793', 'ppl': '2.393', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 9699328, 'tokens/trainable': 9611256, 'epoch': '1.092'}
 21%|████████████████████████████████████████                                                                                                                                                        | 1184/5680 [3:15:22<9:48:39,  7.86s/it] 21%|████████████████████████████████████████                                                                                                                                                        | 1185/5680 [3:15:30<9:49:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7047', 'grad_norm': '0.2705', 'learning_rate': '0.0001793', 'ppl': '2.023', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 9707520, 'tokens/trainable': 9619379, 'epoch': '1.093'}
 21%|████████████████████████████████████████                                                                                                                                                        | 1185/5680 [3:15:30<9:49:39,  7.87s/it] 21%|████████████████████████████████████████                                                                                                                                                        | 1186/5680 [3:15:38<9:49:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3883', 'grad_norm': '0.1962', 'learning_rate': '0.0001793', 'ppl': '1.474', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 9715712, 'tokens/trainable': 9627494, 'epoch': '1.093'}
 21%|████████████████████████████████████████                                                                                                                                                        | 1186/5680 [3:15:38<9:49:37,  7.87s/it] 21%|████████████████████████████████████████                                                                                                                                                        | 1187/5680 [3:15:46<9:50:34,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5494', 'grad_norm': '0.2292', 'learning_rate': '0.0001792', 'ppl': '1.732', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 9723904, 'tokens/trainable': 9635450, 'epoch': '1.093'}
 21%|████████████████████████████████████████                                                                                                                                                        | 1187/5680 [3:15:46<9:50:34,  7.89s/it] 21%|████████████████████████████████████████▏                                                                                                                                                       | 1188/5680 [3:15:54<9:56:21,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.8101', 'grad_norm': '0.2532', 'learning_rate': '0.0001792', 'ppl': '2.248', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 9732096, 'tokens/trainable': 9643604, 'epoch': '1.093'}
 21%|████████████████████████████████████████▏                                                                                                                                                       | 1188/5680 [3:15:54<9:56:21,  7.97s/it] 21%|████████████████████████████████████████▏                                                                                                                                                       | 1189/5680 [3:16:02<9:53:18,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7386', 'grad_norm': '0.2499', 'learning_rate': '0.0001792', 'ppl': '2.093', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 9740288, 'tokens/trainable': 9651786, 'epoch': '1.093'}
 21%|████████████████████████████████████████▏                                                                                                                                                       | 1189/5680 [3:16:02<9:53:18,  7.93s/it] 21%|████████████████████████████████████████▏                                                                                                                                                       | 1190/5680 [3:16:09<9:51:50,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6098', 'grad_norm': '0.2056', 'learning_rate': '0.0001791', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9748480, 'tokens/trainable': 9659812, 'epoch': '1.093'}
 21%|████████████████████████████████████████▏                                                                                                                                                       | 1190/5680 [3:16:09<9:51:50,  7.91s/it] 21%|████████████████████████████████████████▎                                                                                                                                                       | 1191/5680 [3:16:17<9:50:52,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8561', 'grad_norm': '0.3222', 'learning_rate': '0.0001791', 'ppl': '2.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 9756672, 'tokens/trainable': 9667976, 'epoch': '1.094'}
 21%|████████████████████████████████████████▎                                                                                                                                                       | 1191/5680 [3:16:17<9:50:52,  7.90s/it] 21%|████████████████████████████████████████▎                                                                                                                                                       | 1192/5680 [3:16:25<9:49:55,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6335', 'grad_norm': '0.2395', 'learning_rate': '0.0001791', 'ppl': '1.884', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 9764864, 'tokens/trainable': 9676117, 'epoch': '1.094'}
 21%|████████████████████████████████████████▎                                                                                                                                                       | 1192/5680 [3:16:25<9:49:55,  7.89s/it] 21%|████████████████████████████████████████▎                                                                                                                                                       | 1193/5680 [3:16:33<9:49:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9611', 'grad_norm': '0.2732', 'learning_rate': '0.000179', 'ppl': '2.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9773056, 'tokens/trainable': 9684245, 'epoch': '1.094'}
 21%|████████████████████████████████████████▎                                                                                                                                                       | 1193/5680 [3:16:33<9:49:37,  7.88s/it] 21%|████████████████████████████████████████▎                                                                                                                                                       | 1194/5680 [3:16:41<9:48:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6101', 'grad_norm': '0.2586', 'learning_rate': '0.000179', 'ppl': '1.841', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 9781248, 'tokens/trainable': 9692236, 'epoch': '1.094'}
 21%|████████████████████████████████████████▎                                                                                                                                                       | 1194/5680 [3:16:41<9:48:46,  7.87s/it] 21%|████████████████████████████████████████▍                                                                                                                                                       | 1195/5680 [3:16:49<9:48:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5686', 'grad_norm': '0.2456', 'learning_rate': '0.000179', 'ppl': '1.766', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 9789440, 'tokens/trainable': 9700279, 'epoch': '1.094'}
 21%|████████████████████████████████████████▍                                                                                                                                                       | 1195/5680 [3:16:49<9:48:26,  7.87s/it] 21%|████████████████████████████████████████▍                                                                                                                                                       | 1196/5680 [3:16:57<9:48:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8001', 'grad_norm': '0.2585', 'learning_rate': '0.0001789', 'ppl': '2.226', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 9797632, 'tokens/trainable': 9708267, 'epoch': '1.095'}
 21%|████████████████████████████████████████▍                                                                                                                                                       | 1196/5680 [3:16:57<9:48:03,  7.87s/it] 21%|████████████████████████████████████████▍                                                                                                                                                       | 1197/5680 [3:17:04<9:48:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.536', 'grad_norm': '0.2507', 'learning_rate': '0.0001789', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 9805824, 'tokens/trainable': 9716338, 'epoch': '1.095'}
 21%|████████████████████████████████████████▍                                                                                                                                                       | 1197/5680 [3:17:04<9:48:09,  7.87s/it] 21%|████████████████████████████████████████▍                                                                                                                                                       | 1198/5680 [3:17:12<9:47:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9255', 'grad_norm': '0.2792', 'learning_rate': '0.0001789', 'ppl': '2.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9814016, 'tokens/trainable': 9724355, 'epoch': '1.095'}
 21%|████████████████████████████████████████▍                                                                                                                                                       | 1198/5680 [3:17:12<9:47:50,  7.87s/it] 21%|████████████████████████████████████████▌                                                                                                                                                       | 1199/5680 [3:17:20<9:48:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6581', 'grad_norm': '0.2378', 'learning_rate': '0.0001788', 'ppl': '1.931', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 9822208, 'tokens/trainable': 9732478, 'epoch': '1.095'}
 21%|████████████████████████████████████████▌                                                                                                                                                       | 1199/5680 [3:17:20<9:48:05,  7.87s/it] 21%|████████████████████████████████████████▌                                                                                                                                                       | 1200/5680 [3:17:28<9:48:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5536', 'grad_norm': '0.2337', 'learning_rate': '0.0001788', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 9830400, 'tokens/trainable': 9740508, 'epoch': '1.095'}
 21%|████████████████████████████████████████▌                                                                                                                                                       | 1200/5680 [3:17:28<9:48:03,  7.88s/it] 21%|████████████████████████████████████████▌                                                                                                                                                       | 1201/5680 [3:17:36<9:48:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6263', 'grad_norm': '0.2701', 'learning_rate': '0.0001788', 'ppl': '1.871', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 9838592, 'tokens/trainable': 9748543, 'epoch': '1.095'}
 21%|████████████████████████████████████████▌                                                                                                                                                       | 1201/5680 [3:17:36<9:48:20,  7.88s/it] 21%|████████████████████████████████████████▋                                                                                                                                                       | 1202/5680 [3:17:44<9:47:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8845', 'grad_norm': '0.2669', 'learning_rate': '0.0001787', 'ppl': '2.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9846784, 'tokens/trainable': 9756545, 'epoch': '1.096'}
 21%|████████████████████████████████████████▋                                                                                                                                                       | 1202/5680 [3:17:44<9:47:26,  7.87s/it] 21%|████████████████████████████████████████▋                                                                                                                                                       | 1203/5680 [3:17:52<9:46:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6644', 'grad_norm': '0.2296', 'learning_rate': '0.0001787', 'ppl': '1.943', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 9854976, 'tokens/trainable': 9764719, 'epoch': '1.096'}
 21%|████████████████████████████████████████▋                                                                                                                                                       | 1203/5680 [3:17:52<9:46:22,  7.86s/it] 21%|████████████████████████████████████████▋                                                                                                                                                       | 1204/5680 [3:18:00<9:48:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7603', 'grad_norm': '0.2867', 'learning_rate': '0.0001787', 'ppl': '2.139', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 9863168, 'tokens/trainable': 9772766, 'epoch': '1.096'}
 21%|████████████████████████████████████████▋                                                                                                                                                       | 1204/5680 [3:18:00<9:48:05,  7.88s/it] 21%|████████████████████████████████████████▋                                                                                                                                                       | 1205/5680 [3:18:07<9:47:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8096', 'grad_norm': '0.273', 'learning_rate': '0.0001786', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 9871360, 'tokens/trainable': 9780750, 'epoch': '1.096'}
 21%|████████████████████████████████████████▋                                                                                                                                                       | 1205/5680 [3:18:07<9:47:52,  7.88s/it] 21%|████████████████████████████████████████▊                                                                                                                                                       | 1206/5680 [3:18:15<9:48:55,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8753', 'grad_norm': '0.267', 'learning_rate': '0.0001786', 'ppl': '2.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 9879552, 'tokens/trainable': 9788810, 'epoch': '1.096'}
 21%|████████████████████████████████████████▊                                                                                                                                                       | 1206/5680 [3:18:15<9:48:55,  7.90s/it] 21%|████████████████████████████████████████▊                                                                                                                                                       | 1207/5680 [3:18:23<9:49:18,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5821', 'grad_norm': '0.2407', 'learning_rate': '0.0001786', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 9887744, 'tokens/trainable': 9796844, 'epoch': '1.096'}
 21%|████████████████████████████████████████▊                                                                                                                                                       | 1207/5680 [3:18:23<9:49:18,  7.90s/it] 21%|████████████████████████████████████████▊                                                                                                                                                       | 1208/5680 [3:18:31<9:49:03,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7676', 'grad_norm': '0.2295', 'learning_rate': '0.0001785', 'ppl': '2.155', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 9895936, 'tokens/trainable': 9804845, 'epoch': '1.097'}
 21%|████████████████████████████████████████▊                                                                                                                                                       | 1208/5680 [3:18:31<9:49:03,  7.90s/it] 21%|████████████████████████████████████████▊                                                                                                                                                       | 1209/5680 [3:18:39<9:49:39,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6789', 'grad_norm': '0.238', 'learning_rate': '0.0001785', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 9904128, 'tokens/trainable': 9812865, 'epoch': '1.097'}
 21%|████████████████████████████████████████▊                                                                                                                                                       | 1209/5680 [3:18:39<9:49:39,  7.91s/it] 21%|████████████████████████████████████████▉                                                                                                                                                       | 1210/5680 [3:18:47<9:50:17,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5063', 'grad_norm': '0.2403', 'learning_rate': '0.0001785', 'ppl': '1.659', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 9912320, 'tokens/trainable': 9821046, 'epoch': '1.097'}
 21%|████████████████████████████████████████▉                                                                                                                                                       | 1210/5680 [3:18:47<9:50:17,  7.92s/it] 21%|████████████████████████████████████████▉                                                                                                                                                       | 1211/5680 [3:18:55<9:50:35,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6982', 'grad_norm': '0.2411', 'learning_rate': '0.0001784', 'ppl': '2.01', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 9920512, 'tokens/trainable': 9829071, 'epoch': '1.097'}
 21%|████████████████████████████████████████▉                                                                                                                                                       | 1211/5680 [3:18:55<9:50:35,  7.93s/it] 21%|████████████████████████████████████████▉                                                                                                                                                       | 1212/5680 [3:19:03<9:50:14,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7891', 'grad_norm': '0.2273', 'learning_rate': '0.0001784', 'ppl': '2.201', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 9928704, 'tokens/trainable': 9837122, 'epoch': '1.097'}
 21%|████████████████████████████████████████▉                                                                                                                                                       | 1212/5680 [3:19:03<9:50:14,  7.93s/it] 21%|█████████████████████████████████████████                                                                                                                                                       | 1213/5680 [3:19:11<9:50:48,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6017', 'grad_norm': '0.2405', 'learning_rate': '0.0001784', 'ppl': '1.825', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 9936896, 'tokens/trainable': 9845238, 'epoch': '1.098'}
 21%|█████████████████████████████████████████                                                                                                                                                       | 1213/5680 [3:19:11<9:50:48,  7.94s/it] 21%|█████████████████████████████████████████                                                                                                                                                       | 1214/5680 [3:19:19<9:51:05,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.9972', 'grad_norm': '0.2869', 'learning_rate': '0.0001783', 'ppl': '2.711', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 9945088, 'tokens/trainable': 9853340, 'epoch': '1.098'}
 21%|█████████████████████████████████████████                                                                                                                                                       | 1214/5680 [3:19:19<9:51:05,  7.94s/it] 21%|█████████████████████████████████████████                                                                                                                                                       | 1215/5680 [3:19:27<9:52:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5573', 'grad_norm': '0.2596', 'learning_rate': '0.0001783', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.9', 'tokens/total': 9953280, 'tokens/trainable': 9861236, 'epoch': '1.098'}
 21%|█████████████████████████████████████████                                                                                                                                                       | 1215/5680 [3:19:27<9:52:19,  7.96s/it] 21%|█████████████████████████████████████████                                                                                                                                                       | 1216/5680 [3:19:35<9:51:42,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5445', 'grad_norm': '0.2228', 'learning_rate': '0.0001783', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 9961472, 'tokens/trainable': 9869385, 'epoch': '1.098'}
 21%|█████████████████████████████████████████                                                                                                                                                       | 1216/5680 [3:19:35<9:51:42,  7.95s/it] 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1217/5680 [3:19:43<9:50:44,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8521', 'grad_norm': '0.2776', 'learning_rate': '0.0001782', 'ppl': '2.345', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 9969664, 'tokens/trainable': 9877400, 'epoch': '1.098'}
 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1217/5680 [3:19:43<9:50:44,  7.94s/it] 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1218/5680 [3:19:51<9:48:53,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7208', 'grad_norm': '0.2546', 'learning_rate': '0.0001782', 'ppl': '2.056', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 9977856, 'tokens/trainable': 9885518, 'epoch': '1.098'}
 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1218/5680 [3:19:51<9:48:53,  7.92s/it] 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1219/5680 [3:19:58<9:47:39,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6681', 'grad_norm': '0.2593', 'learning_rate': '0.0001782', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 9986048, 'tokens/trainable': 9893580, 'epoch': '1.099'}
 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1219/5680 [3:19:58<9:47:39,  7.90s/it] 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1220/5680 [3:20:06<9:46:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.9113', 'grad_norm': '0.283', 'learning_rate': '0.0001781', 'ppl': '2.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 9994240, 'tokens/trainable': 9901592, 'epoch': '1.099'}
 21%|█████████████████████████████████████████▏                                                                                                                                                      | 1220/5680 [3:20:06<9:46:50,  7.89s/it] 21%|█████████████████████████████████████████▎                                                                                                                                                      | 1221/5680 [3:20:14<9:46:08,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7248', 'grad_norm': '0.271', 'learning_rate': '0.0001781', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 10002432, 'tokens/trainable': 9909688, 'epoch': '1.099'}
 21%|█████████████████████████████████████████▎                                                                                                                                                      | 1221/5680 [3:20:14<9:46:08,  7.89s/it] 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1222/5680 [3:20:22<9:46:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4524', 'grad_norm': '0.2044', 'learning_rate': '0.000178', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.1', 'tokens/total': 10010624, 'tokens/trainable': 9917354, 'epoch': '1.099'}
 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1222/5680 [3:20:22<9:46:12,  7.89s/it] 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1223/5680 [3:20:30<9:45:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5605', 'grad_norm': '0.2376', 'learning_rate': '0.000178', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.9', 'tokens/total': 10018816, 'tokens/trainable': 9925009, 'epoch': '1.099'}
 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1223/5680 [3:20:30<9:45:16,  7.88s/it] 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1224/5680 [3:20:38<9:44:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6705', 'grad_norm': '0.2262', 'learning_rate': '0.000178', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 10027008, 'tokens/trainable': 9933186, 'epoch': '1.099'}
 22%|█████████████████████████████████████████▎                                                                                                                                                      | 1224/5680 [3:20:38<9:44:21,  7.87s/it] 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1225/5680 [3:20:46<9:43:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7683', 'grad_norm': '0.2508', 'learning_rate': '0.0001779', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.9', 'tokens/total': 10035200, 'tokens/trainable': 9941029, 'epoch': '1.1'}
 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1225/5680 [3:20:46<9:43:43,  7.86s/it] 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1226/5680 [3:20:54<9:44:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4562', 'grad_norm': '0.2009', 'learning_rate': '0.0001779', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 10043392, 'tokens/trainable': 9948968, 'epoch': '1.1'}
 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1226/5680 [3:20:54<9:44:09,  7.87s/it] 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1227/5680 [3:21:01<9:43:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7179', 'grad_norm': '0.2618', 'learning_rate': '0.0001779', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 10051584, 'tokens/trainable': 9957068, 'epoch': '1.1'}
 22%|█████████████████████████████████████████▍                                                                                                                                                      | 1227/5680 [3:21:01<9:43:45,  7.87s/it] 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1228/5680 [3:21:09<9:42:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.9286', 'grad_norm': '0.3282', 'learning_rate': '0.0001778', 'ppl': '2.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 10059776, 'tokens/trainable': 9965069, 'epoch': '1.1'}
 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1228/5680 [3:21:09<9:42:23,  7.85s/it] 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1229/5680 [3:21:17<9:42:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4544', 'grad_norm': '0.2148', 'learning_rate': '0.0001778', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.3', 'tokens/total': 10067968, 'tokens/trainable': 9972777, 'epoch': '1.1'}
 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1229/5680 [3:21:17<9:42:58,  7.86s/it] 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1230/5680 [3:21:25<9:42:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7583', 'grad_norm': '0.2405', 'learning_rate': '0.0001778', 'ppl': '2.135', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 10076160, 'tokens/trainable': 9980704, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1230/5680 [3:21:25<9:42:25,  7.85s/it] 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1231/5680 [3:21:33<9:42:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.694', 'grad_norm': '0.2749', 'learning_rate': '0.0001777', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 10084352, 'tokens/trainable': 9988641, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▌                                                                                                                                                      | 1231/5680 [3:21:33<9:42:49,  7.86s/it] 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1232/5680 [3:21:41<9:42:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6627', 'grad_norm': '0.2436', 'learning_rate': '0.0001777', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.5', 'tokens/total': 10092544, 'tokens/trainable': 9996466, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1232/5680 [3:21:41<9:42:23,  7.86s/it] 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1233/5680 [3:21:49<9:41:56,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7406', 'grad_norm': '0.2457', 'learning_rate': '0.0001777', 'ppl': '2.097', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 10100736, 'tokens/trainable': 10004501, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1233/5680 [3:21:49<9:41:56,  7.85s/it] 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1234/5680 [3:21:56<9:41:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.694', 'grad_norm': '0.2534', 'learning_rate': '0.0001776', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 10108928, 'tokens/trainable': 10012622, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1234/5680 [3:21:56<9:41:53,  7.85s/it] 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1235/5680 [3:22:04<9:40:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.9195', 'grad_norm': '0.2643', 'learning_rate': '0.0001776', 'ppl': '2.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 10117120, 'tokens/trainable': 10020609, 'epoch': '1.101'}
 22%|█████████████████████████████████████████▋                                                                                                                                                      | 1235/5680 [3:22:04<9:40:36,  7.84s/it] 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1236/5680 [3:22:12<9:39:55,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.9844', 'grad_norm': '0.2485', 'learning_rate': '0.0001776', 'ppl': '2.676', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 10125312, 'tokens/trainable': 10028521, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1236/5680 [3:22:12<9:39:55,  7.83s/it] 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1237/5680 [3:22:20<9:40:27,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6682', 'grad_norm': '0.2137', 'learning_rate': '0.0001775', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 10133504, 'tokens/trainable': 10036524, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1237/5680 [3:22:20<9:40:27,  7.84s/it] 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1238/5680 [3:22:28<9:40:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4662', 'grad_norm': '0.1915', 'learning_rate': '0.0001775', 'ppl': '1.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 10141696, 'tokens/trainable': 10044486, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▊                                                                                                                                                      | 1238/5680 [3:22:28<9:40:36,  7.84s/it] 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1239/5680 [3:22:36<9:42:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6247', 'grad_norm': '0.2544', 'learning_rate': '0.0001775', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 10149888, 'tokens/trainable': 10052639, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1239/5680 [3:22:36<9:42:15,  7.87s/it] 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1240/5680 [3:22:43<9:41:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9965', 'grad_norm': '0.2654', 'learning_rate': '0.0001774', 'ppl': '2.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 10158080, 'tokens/trainable': 10060762, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1240/5680 [3:22:43<9:41:24,  7.86s/it] 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1241/5680 [3:22:51<9:41:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '1.03', 'grad_norm': '0.249', 'learning_rate': '0.0001774', 'ppl': '2.802', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 10166272, 'tokens/trainable': 10068636, 'epoch': '1.102'}
 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1241/5680 [3:22:51<9:41:21,  7.86s/it] 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1242/5680 [3:22:59<9:41:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6552', 'grad_norm': '0.2332', 'learning_rate': '0.0001774', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 10174464, 'tokens/trainable': 10076711, 'epoch': '1.103'}
 22%|█████████████████████████████████████████▉                                                                                                                                                      | 1242/5680 [3:22:59<9:41:04,  7.86s/it] 22%|██████████████████████████████████████████                                                                                                                                                      | 1243/5680 [3:23:07<9:42:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5301', 'grad_norm': '0.2055', 'learning_rate': '0.0001773', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 10182656, 'tokens/trainable': 10084733, 'epoch': '1.103'}
 22%|██████████████████████████████████████████                                                                                                                                                      | 1243/5680 [3:23:07<9:42:05,  7.87s/it] 22%|██████████████████████████████████████████                                                                                                                                                      | 1244/5680 [3:23:15<9:40:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6127', 'grad_norm': '0.2236', 'learning_rate': '0.0001773', 'ppl': '1.845', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 10190848, 'tokens/trainable': 10092732, 'epoch': '1.103'}
 22%|██████████████████████████████████████████                                                                                                                                                      | 1244/5680 [3:23:15<9:40:55,  7.86s/it] 22%|██████████████████████████████████████████                                                                                                                                                      | 1245/5680 [3:23:23<9:40:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8703', 'grad_norm': '0.2756', 'learning_rate': '0.0001772', 'ppl': '2.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 10199040, 'tokens/trainable': 10100911, 'epoch': '1.103'}
 22%|██████████████████████████████████████████                                                                                                                                                      | 1245/5680 [3:23:23<9:40:48,  7.86s/it] 22%|██████████████████████████████████████████                                                                                                                                                      | 1246/5680 [3:23:31<9:40:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7173', 'grad_norm': '0.2633', 'learning_rate': '0.0001772', 'ppl': '2.049', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.1', 'tokens/total': 10207232, 'tokens/trainable': 10108733, 'epoch': '1.103'}
 22%|██████████████████████████████████████████                                                                                                                                                      | 1246/5680 [3:23:31<9:40:26,  7.85s/it] 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1247/5680 [3:23:38<9:40:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8272', 'grad_norm': '0.276', 'learning_rate': '0.0001772', 'ppl': '2.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.5', 'tokens/total': 10215424, 'tokens/trainable': 10116559, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1247/5680 [3:23:38<9:40:30,  7.86s/it] 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1248/5680 [3:23:46<9:40:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.832', 'grad_norm': '0.2524', 'learning_rate': '0.0001771', 'ppl': '2.298', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998', 'tokens/total': 10223616, 'tokens/trainable': 10124415, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1248/5680 [3:23:46<9:40:44,  7.86s/it] 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1249/5680 [3:23:54<9:40:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5801', 'grad_norm': '0.2315', 'learning_rate': '0.0001771', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 10231808, 'tokens/trainable': 10132545, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▏                                                                                                                                                     | 1249/5680 [3:23:54<9:40:46,  7.86s/it] 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1250/5680 [3:24:02<9:40:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6097', 'grad_norm': '0.2753', 'learning_rate': '0.0001771', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.9', 'tokens/total': 10240000, 'tokens/trainable': 10140380, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1250/5680 [3:24:02<9:40:35,  7.86s/it] 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1251/5680 [3:24:10<9:40:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5409', 'grad_norm': '0.2241', 'learning_rate': '0.000177', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 10248192, 'tokens/trainable': 10148270, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1251/5680 [3:24:10<9:40:05,  7.86s/it] 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1252/5680 [3:24:18<9:39:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7062', 'grad_norm': '0.2724', 'learning_rate': '0.000177', 'ppl': '2.026', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '956.7', 'tokens/total': 10256384, 'tokens/trainable': 10155781, 'epoch': '1.104'}
 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1252/5680 [3:24:18<9:39:49,  7.86s/it] 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1253/5680 [3:24:26<9:40:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5514', 'grad_norm': '0.2294', 'learning_rate': '0.000177', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.6', 'tokens/total': 10264576, 'tokens/trainable': 10163455, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▎                                                                                                                                                     | 1253/5680 [3:24:26<9:40:39,  7.87s/it] 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1254/5680 [3:24:34<9:41:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6761', 'grad_norm': '0.2541', 'learning_rate': '0.0001769', 'ppl': '1.966', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 10272768, 'tokens/trainable': 10171584, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1254/5680 [3:24:34<9:41:03,  7.88s/it] 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1255/5680 [3:24:41<9:40:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6146', 'grad_norm': '0.2499', 'learning_rate': '0.0001769', 'ppl': '1.849', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 10280960, 'tokens/trainable': 10179592, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1255/5680 [3:24:41<9:40:42,  7.87s/it] 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1256/5680 [3:24:49<9:40:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6497', 'grad_norm': '0.2375', 'learning_rate': '0.0001769', 'ppl': '1.915', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 10289152, 'tokens/trainable': 10187765, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1256/5680 [3:24:49<9:40:22,  7.87s/it] 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1257/5680 [3:24:57<9:47:18,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5123', 'grad_norm': '0.2496', 'learning_rate': '0.0001768', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '931.4', 'tokens/total': 10297344, 'tokens/trainable': 10195392, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▍                                                                                                                                                     | 1257/5680 [3:24:57<9:47:18,  7.97s/it] 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1258/5680 [3:25:05<9:45:58,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6659', 'grad_norm': '0.2457', 'learning_rate': '0.0001768', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 10305536, 'tokens/trainable': 10203362, 'epoch': '1.105'}
 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1258/5680 [3:25:05<9:45:58,  7.95s/it] 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1259/5680 [3:25:13<9:45:11,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6943', 'grad_norm': '0.2791', 'learning_rate': '0.0001768', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976.7', 'tokens/total': 10313728, 'tokens/trainable': 10211097, 'epoch': '1.106'}
 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1259/5680 [3:25:13<9:45:11,  7.94s/it] 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1260/5680 [3:25:21<9:43:03,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6334', 'grad_norm': '0.2191', 'learning_rate': '0.0001767', 'ppl': '1.884', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 10321920, 'tokens/trainable': 10219034, 'epoch': '1.106'}
 22%|██████████████████████████████████████████▌                                                                                                                                                     | 1260/5680 [3:25:21<9:43:03,  7.91s/it] 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1261/5680 [3:25:29<9:41:29,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '1.064', 'grad_norm': '0.2849', 'learning_rate': '0.0001767', 'ppl': '2.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 10330112, 'tokens/trainable': 10227126, 'epoch': '1.106'}
 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1261/5680 [3:25:29<9:41:29,  7.90s/it] 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1262/5680 [3:25:37<9:41:03,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6987', 'grad_norm': '0.2711', 'learning_rate': '0.0001766', 'ppl': '2.011', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 10338304, 'tokens/trainable': 10235020, 'epoch': '1.106'}
 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1262/5680 [3:25:37<9:41:03,  7.89s/it] 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1263/5680 [3:25:45<9:40:57,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7924', 'grad_norm': '0.2552', 'learning_rate': '0.0001766', 'ppl': '2.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 10346496, 'tokens/trainable': 10243132, 'epoch': '1.106'}
 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1263/5680 [3:25:45<9:40:57,  7.89s/it] 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1264/5680 [3:25:53<9:40:28,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8466', 'grad_norm': '0.3283', 'learning_rate': '0.0001766', 'ppl': '2.332', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 10354688, 'tokens/trainable': 10251104, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▋                                                                                                                                                     | 1264/5680 [3:25:53<9:40:28,  7.89s/it] 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1265/5680 [3:26:01<9:40:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6333', 'grad_norm': '0.2518', 'learning_rate': '0.0001765', 'ppl': '1.884', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 10362880, 'tokens/trainable': 10259109, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1265/5680 [3:26:01<9:40:11,  7.88s/it] 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1266/5680 [3:26:08<9:39:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6919', 'grad_norm': '0.2365', 'learning_rate': '0.0001765', 'ppl': '1.997', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '969.5', 'tokens/total': 10371072, 'tokens/trainable': 10266722, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1266/5680 [3:26:08<9:39:23,  7.88s/it] 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1267/5680 [3:26:16<9:38:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6556', 'grad_norm': '0.2205', 'learning_rate': '0.0001765', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 10379264, 'tokens/trainable': 10274717, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1267/5680 [3:26:16<9:38:56,  7.87s/it] 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1268/5680 [3:26:24<9:38:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6313', 'grad_norm': '0.2708', 'learning_rate': '0.0001764', 'ppl': '1.88', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 10387456, 'tokens/trainable': 10282793, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▊                                                                                                                                                     | 1268/5680 [3:26:24<9:38:43,  7.87s/it] 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1269/5680 [3:26:32<9:38:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8327', 'grad_norm': '0.2751', 'learning_rate': '0.0001764', 'ppl': '2.3', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 10395648, 'tokens/trainable': 10290796, 'epoch': '1.107'}
 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1269/5680 [3:26:32<9:38:08,  7.86s/it] 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1270/5680 [3:26:40<9:38:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4267', 'grad_norm': '0.2095', 'learning_rate': '0.0001764', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '969.2', 'tokens/total': 10403840, 'tokens/trainable': 10298448, 'epoch': '1.108'}
 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1270/5680 [3:26:40<9:38:44,  7.87s/it] 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1271/5680 [3:26:48<9:38:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7901', 'grad_norm': '0.2641', 'learning_rate': '0.0001763', 'ppl': '2.204', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 10412032, 'tokens/trainable': 10306387, 'epoch': '1.108'}
 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1271/5680 [3:26:48<9:38:05,  7.87s/it] 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1272/5680 [3:26:56<9:37:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5496', 'grad_norm': '0.2285', 'learning_rate': '0.0001763', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.1', 'tokens/total': 10420224, 'tokens/trainable': 10314129, 'epoch': '1.108'}
 22%|██████████████████████████████████████████▉                                                                                                                                                     | 1272/5680 [3:26:56<9:37:39,  7.86s/it] 22%|███████████████████████████████████████████                                                                                                                                                     | 1273/5680 [3:27:03<9:38:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5627', 'grad_norm': '0.2131', 'learning_rate': '0.0001763', 'ppl': '1.755', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 10428416, 'tokens/trainable': 10322217, 'epoch': '1.108'}
 22%|███████████████████████████████████████████                                                                                                                                                     | 1273/5680 [3:27:03<9:38:03,  7.87s/it] 22%|███████████████████████████████████████████                                                                                                                                                     | 1274/5680 [3:27:11<9:37:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8344', 'grad_norm': '0.2383', 'learning_rate': '0.0001762', 'ppl': '2.303', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 10436608, 'tokens/trainable': 10330185, 'epoch': '1.108'}
 22%|███████████████████████████████████████████                                                                                                                                                     | 1274/5680 [3:27:11<9:37:44,  7.87s/it] 22%|███████████████████████████████████████████                                                                                                                                                     | 1275/5680 [3:27:19<9:37:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6172', 'grad_norm': '0.2375', 'learning_rate': '0.0001762', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 10444800, 'tokens/trainable': 10338071, 'epoch': '1.108'}
 22%|███████████████████████████████████████████                                                                                                                                                     | 1275/5680 [3:27:19<9:37:07,  7.86s/it] 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1276/5680 [3:27:27<9:37:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7248', 'grad_norm': '0.2597', 'learning_rate': '0.0001761', 'ppl': '2.064', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 10452992, 'tokens/trainable': 10346186, 'epoch': '1.109'}
 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1276/5680 [3:27:27<9:37:29,  7.87s/it] 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1277/5680 [3:27:35<9:38:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.477', 'grad_norm': '0.2465', 'learning_rate': '0.0001761', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '956', 'tokens/total': 10461184, 'tokens/trainable': 10353741, 'epoch': '1.109'}
 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1277/5680 [3:27:35<9:38:11,  7.88s/it] 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1278/5680 [3:27:43<9:37:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8075', 'grad_norm': '0.3113', 'learning_rate': '0.0001761', 'ppl': '2.242', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 10469376, 'tokens/trainable': 10361797, 'epoch': '1.109'}
 22%|███████████████████████████████████████████▏                                                                                                                                                    | 1278/5680 [3:27:43<9:37:18,  7.87s/it] 23%|███████████████████████████████████████████▏                                                                                                                                                    | 1279/5680 [3:27:51<9:38:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5278', 'grad_norm': '0.2426', 'learning_rate': '0.000176', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 10477568, 'tokens/trainable': 10369742, 'epoch': '1.109'}
 23%|███████████████████████████████████████████▏                                                                                                                                                    | 1279/5680 [3:27:51<9:38:08,  7.88s/it] 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1280/5680 [3:27:59<9:37:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5586', 'grad_norm': '0.2165', 'learning_rate': '0.000176', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 10485760, 'tokens/trainable': 10377866, 'epoch': '1.109'}
 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1280/5680 [3:27:59<9:37:22,  7.87s/it] 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1281/5680 [3:28:06<9:36:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6222', 'grad_norm': '0.2315', 'learning_rate': '0.000176', 'ppl': '1.863', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 10493952, 'tokens/trainable': 10385755, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1281/5680 [3:28:06<9:36:41,  7.87s/it] 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1282/5680 [3:28:14<9:36:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7183', 'grad_norm': '0.2717', 'learning_rate': '0.0001759', 'ppl': '2.051', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.5', 'tokens/total': 10502144, 'tokens/trainable': 10393596, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1282/5680 [3:28:14<9:36:08,  7.86s/it] 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1283/5680 [3:28:22<9:36:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6226', 'grad_norm': '0.2671', 'learning_rate': '0.0001759', 'ppl': '1.864', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '968.2', 'tokens/total': 10510336, 'tokens/trainable': 10401210, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▎                                                                                                                                                    | 1283/5680 [3:28:22<9:36:07,  7.86s/it] 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1284/5680 [3:28:30<9:36:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5874', 'grad_norm': '0.2196', 'learning_rate': '0.0001759', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 10518528, 'tokens/trainable': 10409356, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1284/5680 [3:28:30<9:36:03,  7.86s/it] 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1285/5680 [3:28:38<9:35:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6203', 'grad_norm': '0.2348', 'learning_rate': '0.0001758', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 10526720, 'tokens/trainable': 10417421, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1285/5680 [3:28:38<9:35:28,  7.86s/it] 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1286/5680 [3:28:46<9:35:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7748', 'grad_norm': '0.2745', 'learning_rate': '0.0001758', 'ppl': '2.17', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.8', 'tokens/total': 10534912, 'tokens/trainable': 10425266, 'epoch': '1.11'}
 23%|███████████████████████████████████████████▍                                                                                                                                                    | 1286/5680 [3:28:46<9:35:51,  7.86s/it] 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1287/5680 [3:28:54<9:36:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6587', 'grad_norm': '0.2704', 'learning_rate': '0.0001758', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.5', 'tokens/total': 10543104, 'tokens/trainable': 10432817, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1287/5680 [3:28:54<9:36:37,  7.88s/it] 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1288/5680 [3:29:01<9:35:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.95', 'grad_norm': '0.2616', 'learning_rate': '0.0001757', 'ppl': '2.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973.3', 'tokens/total': 10551296, 'tokens/trainable': 10440450, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1288/5680 [3:29:01<9:35:48,  7.87s/it] 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1289/5680 [3:29:09<9:34:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8971', 'grad_norm': '0.2916', 'learning_rate': '0.0001757', 'ppl': '2.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 10559488, 'tokens/trainable': 10448393, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1289/5680 [3:29:09<9:34:28,  7.85s/it] 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1290/5680 [3:29:17<9:35:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6611', 'grad_norm': '0.2457', 'learning_rate': '0.0001756', 'ppl': '1.937', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983', 'tokens/total': 10567680, 'tokens/trainable': 10456147, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▌                                                                                                                                                    | 1290/5680 [3:29:17<9:35:13,  7.86s/it] 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1291/5680 [3:29:25<9:34:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5222', 'grad_norm': '0.2243', 'learning_rate': '0.0001756', 'ppl': '1.686', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.6', 'tokens/total': 10575872, 'tokens/trainable': 10463828, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1291/5680 [3:29:25<9:34:41,  7.86s/it] 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1292/5680 [3:29:33<9:33:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7391', 'grad_norm': '0.2857', 'learning_rate': '0.0001756', 'ppl': '2.094', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 10584064, 'tokens/trainable': 10471966, 'epoch': '1.111'}
 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1292/5680 [3:29:33<9:33:59,  7.85s/it] 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1293/5680 [3:29:41<9:33:22,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5221', 'grad_norm': '0.2235', 'learning_rate': '0.0001755', 'ppl': '1.686', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.1', 'tokens/total': 10592256, 'tokens/trainable': 10479392, 'epoch': '1.112'}
 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1293/5680 [3:29:41<9:33:22,  7.84s/it] 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1294/5680 [3:29:49<9:34:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5339', 'grad_norm': '0.3254', 'learning_rate': '0.0001755', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 10600448, 'tokens/trainable': 10487478, 'epoch': '1.112'}
 23%|███████████████████████████████████████████▋                                                                                                                                                    | 1294/5680 [3:29:49<9:34:13,  7.86s/it] 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1295/5680 [3:29:56<9:34:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4768', 'grad_norm': '0.2272', 'learning_rate': '0.0001755', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 10608640, 'tokens/trainable': 10495481, 'epoch': '1.112'}
 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1295/5680 [3:29:56<9:34:07,  7.86s/it] 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1296/5680 [3:30:04<9:34:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5451', 'grad_norm': '0.2233', 'learning_rate': '0.0001754', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 10616832, 'tokens/trainable': 10503600, 'epoch': '1.112'}
 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1296/5680 [3:30:04<9:34:39,  7.86s/it] 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1297/5680 [3:30:12<9:34:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.892', 'grad_norm': '0.3001', 'learning_rate': '0.0001754', 'ppl': '2.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.2', 'tokens/total': 10625024, 'tokens/trainable': 10511450, 'epoch': '1.112'}
 23%|███████████████████████████████████████████▊                                                                                                                                                    | 1297/5680 [3:30:12<9:34:33,  7.87s/it] 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1298/5680 [3:30:20<9:34:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5375', 'grad_norm': '0.2367', 'learning_rate': '0.0001754', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 10633216, 'tokens/trainable': 10519479, 'epoch': '1.113'}
 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1298/5680 [3:30:20<9:34:36,  7.87s/it] 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1299/5680 [3:30:28<9:34:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.976', 'grad_norm': '0.2756', 'learning_rate': '0.0001753', 'ppl': '2.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 10641408, 'tokens/trainable': 10527543, 'epoch': '1.113'}
 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1299/5680 [3:30:28<9:34:19,  7.87s/it] 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1300/5680 [3:30:36<9:33:05,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5279', 'grad_norm': '0.2214', 'learning_rate': '0.0001753', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 10649600, 'tokens/trainable': 10535701, 'epoch': '1.113'}
 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1300/5680 [3:30:36<9:33:05,  7.85s/it] 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1301/5680 [3:30:44<9:33:05,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6612', 'grad_norm': '0.2403', 'learning_rate': '0.0001752', 'ppl': '1.937', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.8', 'tokens/total': 10657792, 'tokens/trainable': 10543279, 'epoch': '1.113'}
 23%|███████████████████████████████████████████▉                                                                                                                                                    | 1301/5680 [3:30:44<9:33:05,  7.85s/it] 23%|████████████████████████████████████████████                                                                                                                                                    | 1302/5680 [3:30:51<9:33:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4431', 'grad_norm': '0.2119', 'learning_rate': '0.0001752', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 10665984, 'tokens/trainable': 10551360, 'epoch': '1.113'}
 23%|████████████████████████████████████████████                                                                                                                                                    | 1302/5680 [3:30:51<9:33:57,  7.87s/it] 23%|████████████████████████████████████████████                                                                                                                                                    | 1303/5680 [3:30:59<9:34:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5605', 'grad_norm': '0.2431', 'learning_rate': '0.0001752', 'ppl': '1.752', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 10674176, 'tokens/trainable': 10559247, 'epoch': '1.113'}
 23%|████████████████████████████████████████████                                                                                                                                                    | 1303/5680 [3:30:59<9:34:02,  7.87s/it] 23%|████████████████████████████████████████████                                                                                                                                                    | 1304/5680 [3:31:07<9:33:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8495', 'grad_norm': '0.2673', 'learning_rate': '0.0001751', 'ppl': '2.339', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 10682368, 'tokens/trainable': 10567085, 'epoch': '1.114'}
 23%|████████████████████████████████████████████                                                                                                                                                    | 1304/5680 [3:31:07<9:33:44,  7.87s/it] 23%|████████████████████████████████████████████                                                                                                                                                    | 1305/5680 [3:31:15<9:33:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9106', 'grad_norm': '0.2437', 'learning_rate': '0.0001751', 'ppl': '2.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 10690560, 'tokens/trainable': 10575228, 'epoch': '1.114'}
 23%|████████████████████████████████████████████                                                                                                                                                    | 1305/5680 [3:31:15<9:33:15,  7.86s/it] 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1306/5680 [3:31:23<9:39:06,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7217', 'grad_norm': '0.2771', 'learning_rate': '0.0001751', 'ppl': '2.058', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.4', 'tokens/total': 10698752, 'tokens/trainable': 10583314, 'epoch': '1.114'}
 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1306/5680 [3:31:23<9:39:06,  7.94s/it] 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1307/5680 [3:31:31<9:37:36,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6974', 'grad_norm': '0.2355', 'learning_rate': '0.000175', 'ppl': '2.008', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 10706944, 'tokens/trainable': 10591356, 'epoch': '1.114'}
 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1307/5680 [3:31:31<9:37:36,  7.93s/it] 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1308/5680 [3:31:39<9:35:10,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5495', 'grad_norm': '0.2323', 'learning_rate': '0.000175', 'ppl': '1.732', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.9', 'tokens/total': 10715136, 'tokens/trainable': 10599150, 'epoch': '1.114'}
 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1308/5680 [3:31:39<9:35:10,  7.89s/it] 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1309/5680 [3:31:47<9:34:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8033', 'grad_norm': '0.2662', 'learning_rate': '0.000175', 'ppl': '2.233', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '961.7', 'tokens/total': 10723328, 'tokens/trainable': 10606697, 'epoch': '1.114'}
 23%|████████████████████████████████████████████▏                                                                                                                                                   | 1309/5680 [3:31:47<9:34:08,  7.88s/it] 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1310/5680 [3:31:55<9:33:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6402', 'grad_norm': '0.2626', 'learning_rate': '0.0001749', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 10731520, 'tokens/trainable': 10614485, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1310/5680 [3:31:55<9:33:57,  7.88s/it] 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1311/5680 [3:32:02<9:33:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.814', 'grad_norm': '0.2702', 'learning_rate': '0.0001749', 'ppl': '2.257', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 10739712, 'tokens/trainable': 10622389, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1311/5680 [3:32:02<9:33:00,  7.87s/it] 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1312/5680 [3:32:10<9:31:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.625', 'grad_norm': '0.2267', 'learning_rate': '0.0001748', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 10747904, 'tokens/trainable': 10630539, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▎                                                                                                                                                   | 1312/5680 [3:32:10<9:31:48,  7.85s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1313/5680 [3:32:18<9:31:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8305', 'grad_norm': '0.275', 'learning_rate': '0.0001748', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 10756096, 'tokens/trainable': 10638518, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1313/5680 [3:32:18<9:31:29,  7.85s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1314/5680 [3:32:26<9:32:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4305', 'grad_norm': '0.215', 'learning_rate': '0.0001748', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.2', 'tokens/total': 10764288, 'tokens/trainable': 10646390, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1314/5680 [3:32:26<9:32:29,  7.87s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1315/5680 [3:32:34<9:34:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.828', 'grad_norm': '0.3249', 'learning_rate': '0.0001747', 'ppl': '2.289', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.9', 'tokens/total': 10772480, 'tokens/trainable': 10654128, 'epoch': '1.115'}
 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1315/5680 [3:32:34<9:34:16,  7.89s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1316/5680 [3:32:42<9:35:16,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8308', 'grad_norm': '0.2611', 'learning_rate': '0.0001747', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 10780672, 'tokens/trainable': 10662122, 'epoch': '1.116'}
 23%|████████████████████████████████████████████▍                                                                                                                                                   | 1316/5680 [3:32:42<9:35:16,  7.91s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1317/5680 [3:32:50<9:33:45,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.726', 'grad_norm': '0.2494', 'learning_rate': '0.0001747', 'ppl': '2.067', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '950.9', 'tokens/total': 10788864, 'tokens/trainable': 10669581, 'epoch': '1.116'}
 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1317/5680 [3:32:50<9:33:45,  7.89s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1318/5680 [3:32:58<9:33:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5751', 'grad_norm': '0.2935', 'learning_rate': '0.0001746', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '935.3', 'tokens/total': 10797056, 'tokens/trainable': 10676968, 'epoch': '1.116'}
 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1318/5680 [3:32:58<9:33:50,  7.89s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1319/5680 [3:33:06<9:34:51,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7921', 'grad_norm': '0.3328', 'learning_rate': '0.0001746', 'ppl': '2.208', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '841.9', 'tokens/total': 10805248, 'tokens/trainable': 10683567, 'epoch': '1.116'}
 23%|████████████████████████████████████████████▌                                                                                                                                                   | 1319/5680 [3:33:06<9:34:51,  7.91s/it][2026-01-27 01:22:19,843] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:59360] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 01:22:21,134] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:59360] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:05<04:49, 19.26 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:05<02:09, 42.36 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:06<01:20, 66.50 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:06<00:55, 93.98 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:07<00:42, 119.98 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:07<00:34, 144.86 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:08<00:32, 153.63 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:08<00:25, 186.11 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:09<00:25, 188.00 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:09<00:22, 203.47 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:10<00:21, 212.47 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:10<00:20, 210.35 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:10<00:19, 217.87 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:11<00:19, 211.84 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:11<00:18, 222.61 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:12<00:17, 226.42 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:12<00:17, 225.14 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:13<00:17, 221.02 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:13<00:14, 246.48 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:14<00:14, 243.53 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:14<00:14, 235.79 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:14<00:13, 252.16 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:15<00:13, 245.90 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:15<00:12, 247.98 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:16<00:13, 230.37 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:16<00:13, 222.28 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:17<00:10, 266.56 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:17<00:11, 247.89 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:18<00:10, 241.82 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:18<00:10, 231.67 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:18<00:09, 246.62 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:19<00:09, 236.82 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:19<00:08, 250.24 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:20<00:08, 240.31 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:20<00:08, 243.95 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:20<00:07, 259.31 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:21<00:07, 247.86 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:21<00:06, 252.52 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:22<00:07, 213.52 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:22<00:05, 252.93 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:23<00:05, 240.03 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:23<00:05, 224.23 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:24<00:04, 244.54 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:24<00:04, 242.99 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:24<00:03, 243.62 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:25<00:03, 241.15 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:25<00:02, 247.98 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:26<00:02, 217.89 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:26<00:02, 233.89 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:27<00:01, 247.49 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:27<00:01, 246.00 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:28<00:00, 221.06 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:28<00:00, 255.42 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:28<00:00, 267.84 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:29<00:00, 191.01 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▌                                                                                                                                     | 1000/5677 [00:00<00:04, 1008.24 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1383.86 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1575.97 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:00, 1733.77 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1805.60 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1776.49 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1634.76 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1444.34 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2102.56 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2489.19 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2767.43 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2887.98 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:01<00:00, 2529.89 examples/s]
[2026-01-27 01:22:56,869] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:59360] Using single process for pack_parallel, running sequentially.
[2026-01-27 01:23:02,018] [WARNING] [py.warnings._showwarnmsg:109] [PID:59360] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1320/5680 [3:33:56<24:59:04, 20.63s/it]                                                                                                                                                                                                                                             {'loss': '0.7592', 'grad_norm': '0.2184', 'learning_rate': '0.0001745', 'ppl': '2.137', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 10813440, 'tokens/trainable': 10691721, 'epoch': '2'}
 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1320/5680 [3:33:56<24:59:04, 20.63s/it][2026-01-27 01:23:10,081] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:59599] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 01:23:11,161] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:59599] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:03<03:24, 27.28 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:32, 59.21 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                  | 318/5677 [00:04<00:52, 101.78 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:04<00:40, 128.82 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:30, 166.86 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:24, 202.48 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:05<00:21, 231.31 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:18, 256.99 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:06<00:17, 275.03 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:06<00:15, 293.25 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:14, 301.76 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:07<00:14, 311.07 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:07<00:11, 367.31 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:07<00:13, 307.26 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:08<00:11, 352.35 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:08<00:12, 309.17 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:08<00:12, 314.33 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:11, 320.54 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:09<00:11, 324.51 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:09<00:10, 328.01 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:10<00:10, 315.94 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:10<00:10, 333.02 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:10<00:09, 339.02 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:11<00:09, 335.77 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:11<00:09, 337.21 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:11<00:08, 336.69 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:12<00:08, 337.22 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:12<00:08, 338.22 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:12<00:07, 338.10 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:13<00:07, 336.49 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:13<00:07, 340.71 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:13<00:06, 341.84 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:13<00:06, 338.58 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:14<00:06, 341.28 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:14<00:05, 343.12 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:14<00:05, 342.30 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:15<00:05, 341.66 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:15<00:04, 344.42 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:15<00:04, 315.28 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:16<00:04, 348.53 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:16<00:04, 340.90 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:16<00:03, 330.87 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:17<00:03, 344.50 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:17<00:03, 341.42 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:17<00:02, 338.34 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:17<00:02, 338.83 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:18<00:02, 330.54 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:18<00:01, 331.34 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:18<00:01, 335.42 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:19<00:01, 336.82 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:19<00:00, 330.70 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:19<00:00, 361.87 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:20<00:00, 367.35 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:20<00:00, 348.04 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:20<00:00, 271.46 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:04, 999.05 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1404.36 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1567.48 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:00, 1704.38 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1750.87 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1790.66 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1629.57 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1386.34 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2101.66 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2483.13 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2747.83 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2856.73 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2498.76 examples/s]
[2026-01-27 01:23:43,315] [WARNING] [py.warnings._showwarnmsg:109] [PID:59599] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1321/5680 [3:34:37<32:28:48, 26.82s/it]                                                                                                                                                                                                                                             {'loss': '0.8015', 'grad_norm': '0.2419', 'learning_rate': '0.0001745', 'ppl': '2.229', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '198', 'tokens/total': 10821632, 'tokens/trainable': 10699894, 'epoch': '2'}
 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1321/5680 [3:34:37<32:28:48, 26.82s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1322/5680 [3:34:45<25:36:01, 21.15s/it]                                                                                                                                                                                                                                             {'loss': '0.5914', 'grad_norm': '0.2034', 'learning_rate': '0.0001745', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 10829824, 'tokens/trainable': 10708044, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1322/5680 [3:34:45<25:36:01, 21.15s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1323/5680 [3:34:53<20:47:25, 17.18s/it]                                                                                                                                                                                                                                             {'loss': '0.5193', 'grad_norm': '0.2269', 'learning_rate': '0.0001744', 'ppl': '1.681', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 10838016, 'tokens/trainable': 10716234, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▍                                                                                                                                                  | 1323/5680 [3:34:53<20:47:25, 17.18s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1324/5680 [3:35:01<17:24:33, 14.39s/it]                                                                                                                                                                                                                                             {'loss': '0.5661', 'grad_norm': '0.2166', 'learning_rate': '0.0001744', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 10846208, 'tokens/trainable': 10724383, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1324/5680 [3:35:01<17:24:33, 14.39s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1325/5680 [3:35:09<15:02:47, 12.44s/it]                                                                                                                                                                                                                                             {'loss': '0.687', 'grad_norm': '0.2478', 'learning_rate': '0.0001744', 'ppl': '1.988', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 10854400, 'tokens/trainable': 10732559, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1325/5680 [3:35:09<15:02:47, 12.44s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1326/5680 [3:35:17<13:24:18, 11.08s/it]                                                                                                                                                                                                                                             {'loss': '0.697', 'grad_norm': '0.2618', 'learning_rate': '0.0001743', 'ppl': '2.008', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 10862592, 'tokens/trainable': 10740729, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1326/5680 [3:35:17<13:24:18, 11.08s/it] 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1327/5680 [3:35:25<12:14:06, 10.12s/it]                                                                                                                                                                                                                                             {'loss': '0.8826', 'grad_norm': '0.2818', 'learning_rate': '0.0001743', 'ppl': '2.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 10870784, 'tokens/trainable': 10748908, 'epoch': '2.001'}
 23%|████████████████████████████████████████████▌                                                                                                                                                  | 1327/5680 [3:35:25<12:14:06, 10.12s/it] 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1328/5680 [3:35:32<11:24:42,  9.44s/it]                                                                                                                                                                                                                                             {'loss': '0.5724', 'grad_norm': '0.2587', 'learning_rate': '0.0001743', 'ppl': '1.773', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 10878976, 'tokens/trainable': 10757051, 'epoch': '2.002'}
 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1328/5680 [3:35:32<11:24:42,  9.44s/it] 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1329/5680 [3:35:40<10:49:56,  8.96s/it]                                                                                                                                                                                                                                             {'loss': '0.621', 'grad_norm': '0.2861', 'learning_rate': '0.0001742', 'ppl': '1.861', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 10887168, 'tokens/trainable': 10765226, 'epoch': '2.002'}
 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1329/5680 [3:35:40<10:49:56,  8.96s/it] 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1330/5680 [3:35:48<10:26:04,  8.64s/it]                                                                                                                                                                                                                                             {'loss': '1.04', 'grad_norm': '0.2645', 'learning_rate': '0.0001742', 'ppl': '2.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 10895360, 'tokens/trainable': 10773381, 'epoch': '2.002'}
 23%|████████████████████████████████████████████▋                                                                                                                                                  | 1330/5680 [3:35:48<10:26:04,  8.64s/it] 23%|████████████████████████████████████████████▊                                                                                                                                                  | 1331/5680 [3:35:56<10:09:31,  8.41s/it]                                                                                                                                                                                                                                             {'loss': '0.5785', 'grad_norm': '0.3005', 'learning_rate': '0.0001741', 'ppl': '1.783', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 10903552, 'tokens/trainable': 10781552, 'epoch': '2.002'}
 23%|████████████████████████████████████████████▊                                                                                                                                                  | 1331/5680 [3:35:56<10:09:31,  8.41s/it] 23%|█████████████████████████████████████████████                                                                                                                                                   | 1332/5680 [3:36:04<9:57:58,  8.25s/it]                                                                                                                                                                                                                                             {'loss': '0.8438', 'grad_norm': '0.267', 'learning_rate': '0.0001741', 'ppl': '2.325', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 10911744, 'tokens/trainable': 10789731, 'epoch': '2.002'}
 23%|█████████████████████████████████████████████                                                                                                                                                   | 1332/5680 [3:36:04<9:57:58,  8.25s/it] 23%|█████████████████████████████████████████████                                                                                                                                                   | 1333/5680 [3:36:12<9:49:52,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.8882', 'grad_norm': '0.2997', 'learning_rate': '0.0001741', 'ppl': '2.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 10919936, 'tokens/trainable': 10797887, 'epoch': '2.002'}
 23%|█████████████████████████████████████████████                                                                                                                                                   | 1333/5680 [3:36:12<9:49:52,  8.14s/it] 23%|█████████████████████████████████████████████                                                                                                                                                   | 1334/5680 [3:36:20<9:44:19,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6079', 'grad_norm': '0.2267', 'learning_rate': '0.000174', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 10928128, 'tokens/trainable': 10806030, 'epoch': '2.003'}
 23%|█████████████████████████████████████████████                                                                                                                                                   | 1334/5680 [3:36:20<9:44:19,  8.07s/it] 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1335/5680 [3:36:28<9:40:24,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6446', 'grad_norm': '0.2423', 'learning_rate': '0.000174', 'ppl': '1.905', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 10936320, 'tokens/trainable': 10814221, 'epoch': '2.003'}
 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1335/5680 [3:36:28<9:40:24,  8.01s/it] 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1336/5680 [3:36:35<9:37:13,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6832', 'grad_norm': '0.2418', 'learning_rate': '0.000174', 'ppl': '1.98', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 10944512, 'tokens/trainable': 10822389, 'epoch': '2.003'}
 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1336/5680 [3:36:35<9:37:13,  7.97s/it] 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1337/5680 [3:36:43<9:35:28,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '1.003', 'grad_norm': '0.3945', 'learning_rate': '0.0001739', 'ppl': '2.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 10952704, 'tokens/trainable': 10830547, 'epoch': '2.003'}
 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1337/5680 [3:36:43<9:35:28,  7.95s/it] 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1338/5680 [3:36:51<9:34:49,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4374', 'grad_norm': '0.2484', 'learning_rate': '0.0001739', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 10960896, 'tokens/trainable': 10838718, 'epoch': '2.003'}
 24%|█████████████████████████████████████████████▏                                                                                                                                                  | 1338/5680 [3:36:51<9:34:49,  7.94s/it] 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1339/5680 [3:36:59<9:34:14,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7325', 'grad_norm': '0.3162', 'learning_rate': '0.0001738', 'ppl': '2.08', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 10969088, 'tokens/trainable': 10846890, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1339/5680 [3:36:59<9:34:14,  7.94s/it] 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1340/5680 [3:37:07<9:34:51,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.584', 'grad_norm': '0.2829', 'learning_rate': '0.0001738', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 10977280, 'tokens/trainable': 10855079, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1340/5680 [3:37:07<9:34:51,  7.95s/it] 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1341/5680 [3:37:15<9:32:29,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9588', 'grad_norm': '0.2967', 'learning_rate': '0.0001738', 'ppl': '2.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 10985472, 'tokens/trainable': 10863258, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1341/5680 [3:37:15<9:32:29,  7.92s/it] 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1342/5680 [3:37:23<9:31:34,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6762', 'grad_norm': '0.2495', 'learning_rate': '0.0001737', 'ppl': '1.966', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 10993664, 'tokens/trainable': 10871411, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▎                                                                                                                                                  | 1342/5680 [3:37:23<9:31:34,  7.91s/it] 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1343/5680 [3:37:31<9:30:26,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7669', 'grad_norm': '0.2736', 'learning_rate': '0.0001737', 'ppl': '2.153', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 11001856, 'tokens/trainable': 10879600, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1343/5680 [3:37:31<9:30:26,  7.89s/it] 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1344/5680 [3:37:39<9:29:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6622', 'grad_norm': '0.2247', 'learning_rate': '0.0001737', 'ppl': '1.939', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11010048, 'tokens/trainable': 10887749, 'epoch': '2.004'}
 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1344/5680 [3:37:39<9:29:24,  7.88s/it] 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1345/5680 [3:37:47<9:29:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8092', 'grad_norm': '0.2858', 'learning_rate': '0.0001736', 'ppl': '2.246', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11018240, 'tokens/trainable': 10895893, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1345/5680 [3:37:47<9:29:15,  7.88s/it] 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1346/5680 [3:37:54<9:29:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7954', 'grad_norm': '0.3006', 'learning_rate': '0.0001736', 'ppl': '2.215', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11026432, 'tokens/trainable': 10904069, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▍                                                                                                                                                  | 1346/5680 [3:37:54<9:29:20,  7.88s/it] 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1347/5680 [3:38:02<9:28:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.674', 'grad_norm': '0.2371', 'learning_rate': '0.0001735', 'ppl': '1.962', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11034624, 'tokens/trainable': 10912223, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1347/5680 [3:38:02<9:28:53,  7.88s/it] 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1348/5680 [3:38:10<9:28:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5953', 'grad_norm': '0.2528', 'learning_rate': '0.0001735', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11042816, 'tokens/trainable': 10920333, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1348/5680 [3:38:10<9:28:57,  7.88s/it] 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1349/5680 [3:38:18<9:27:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7856', 'grad_norm': '0.2401', 'learning_rate': '0.0001735', 'ppl': '2.194', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 11051008, 'tokens/trainable': 10928515, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▌                                                                                                                                                  | 1349/5680 [3:38:18<9:27:44,  7.87s/it] 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1350/5680 [3:38:26<9:27:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.951', 'grad_norm': '0.3213', 'learning_rate': '0.0001734', 'ppl': '2.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11059200, 'tokens/trainable': 10936653, 'epoch': '2.005'}
 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1350/5680 [3:38:26<9:27:29,  7.86s/it] 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1351/5680 [3:38:34<9:27:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9311', 'grad_norm': '0.4224', 'learning_rate': '0.0001734', 'ppl': '2.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 11067392, 'tokens/trainable': 10944834, 'epoch': '2.006'}
 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1351/5680 [3:38:34<9:27:04,  7.86s/it] 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1352/5680 [3:38:42<9:26:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6034', 'grad_norm': '0.2542', 'learning_rate': '0.0001734', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11075584, 'tokens/trainable': 10952933, 'epoch': '2.006'}
 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1352/5680 [3:38:42<9:26:42,  7.86s/it] 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1353/5680 [3:38:49<9:26:24,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6025', 'grad_norm': '0.2278', 'learning_rate': '0.0001733', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 11083776, 'tokens/trainable': 10961115, 'epoch': '2.006'}
 24%|█████████████████████████████████████████████▋                                                                                                                                                  | 1353/5680 [3:38:49<9:26:24,  7.85s/it] 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1354/5680 [3:38:57<9:26:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5012', 'grad_norm': '0.2201', 'learning_rate': '0.0001733', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11091968, 'tokens/trainable': 10969271, 'epoch': '2.006'}
 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1354/5680 [3:38:57<9:26:16,  7.85s/it] 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1355/5680 [3:39:05<9:27:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7964', 'grad_norm': '0.2618', 'learning_rate': '0.0001732', 'ppl': '2.217', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11100160, 'tokens/trainable': 10977426, 'epoch': '2.006'}
 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1355/5680 [3:39:05<9:27:11,  7.87s/it] 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1356/5680 [3:39:13<9:27:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9948', 'grad_norm': '0.299', 'learning_rate': '0.0001732', 'ppl': '2.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11108352, 'tokens/trainable': 10985591, 'epoch': '2.007'}
 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1356/5680 [3:39:13<9:27:34,  7.88s/it] 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1357/5680 [3:39:21<9:26:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6588', 'grad_norm': '0.2507', 'learning_rate': '0.0001732', 'ppl': '1.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11116544, 'tokens/trainable': 10993728, 'epoch': '2.007'}
 24%|█████████████████████████████████████████████▊                                                                                                                                                  | 1357/5680 [3:39:21<9:26:49,  7.87s/it] 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1358/5680 [3:39:29<9:32:39,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7598', 'grad_norm': '0.2362', 'learning_rate': '0.0001731', 'ppl': '2.138', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 11124736, 'tokens/trainable': 11001901, 'epoch': '2.007'}
 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1358/5680 [3:39:29<9:32:39,  7.95s/it] 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1359/5680 [3:39:37<9:29:48,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8747', 'grad_norm': '0.2431', 'learning_rate': '0.0001731', 'ppl': '2.398', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 11132928, 'tokens/trainable': 11010056, 'epoch': '2.007'}
 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1359/5680 [3:39:37<9:29:48,  7.91s/it] 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1360/5680 [3:39:45<9:28:15,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7074', 'grad_norm': '0.2463', 'learning_rate': '0.0001731', 'ppl': '2.029', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11141120, 'tokens/trainable': 11018218, 'epoch': '2.007'}
 24%|█████████████████████████████████████████████▉                                                                                                                                                  | 1360/5680 [3:39:45<9:28:15,  7.89s/it] 24%|██████████████████████████████████████████████                                                                                                                                                  | 1361/5680 [3:39:53<9:29:07,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3527', 'grad_norm': '0.1737', 'learning_rate': '0.000173', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 11149312, 'tokens/trainable': 11026368, 'epoch': '2.007'}
 24%|██████████████████████████████████████████████                                                                                                                                                  | 1361/5680 [3:39:53<9:29:07,  7.91s/it] 24%|██████████████████████████████████████████████                                                                                                                                                  | 1362/5680 [3:40:01<9:29:35,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5978', 'grad_norm': '0.2319', 'learning_rate': '0.000173', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 11157504, 'tokens/trainable': 11034558, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████                                                                                                                                                  | 1362/5680 [3:40:01<9:29:35,  7.91s/it] 24%|██████████████████████████████████████████████                                                                                                                                                  | 1363/5680 [3:40:09<9:30:34,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '1.139', 'grad_norm': '0.3584', 'learning_rate': '0.0001729', 'ppl': '3.122', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 11165696, 'tokens/trainable': 11042746, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████                                                                                                                                                  | 1363/5680 [3:40:09<9:30:34,  7.93s/it] 24%|██████████████████████████████████████████████                                                                                                                                                  | 1364/5680 [3:40:17<9:37:06,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5531', 'grad_norm': '0.2375', 'learning_rate': '0.0001729', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.2', 'tokens/total': 11173888, 'tokens/trainable': 11050886, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████                                                                                                                                                  | 1364/5680 [3:40:17<9:37:06,  8.02s/it] 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1365/5680 [3:40:25<9:33:45,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.587', 'grad_norm': '0.242', 'learning_rate': '0.0001729', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11182080, 'tokens/trainable': 11059001, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1365/5680 [3:40:25<9:33:45,  7.98s/it] 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1366/5680 [3:40:33<9:31:34,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7759', 'grad_norm': '0.2865', 'learning_rate': '0.0001728', 'ppl': '2.173', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11190272, 'tokens/trainable': 11067186, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1366/5680 [3:40:33<9:31:34,  7.95s/it] 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1367/5680 [3:40:40<9:31:35,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4325', 'grad_norm': '0.2115', 'learning_rate': '0.0001728', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 11198464, 'tokens/trainable': 11075339, 'epoch': '2.008'}
 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1367/5680 [3:40:40<9:31:35,  7.95s/it] 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1368/5680 [3:40:48<9:30:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6065', 'grad_norm': '0.2571', 'learning_rate': '0.0001728', 'ppl': '1.834', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 11206656, 'tokens/trainable': 11083487, 'epoch': '2.009'}
 24%|██████████████████████████████████████████████▏                                                                                                                                                 | 1368/5680 [3:40:48<9:30:55,  7.94s/it] 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1369/5680 [3:40:56<9:29:01,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8097', 'grad_norm': '0.2758', 'learning_rate': '0.0001727', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11214848, 'tokens/trainable': 11091637, 'epoch': '2.009'}
 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1369/5680 [3:40:56<9:29:01,  7.92s/it] 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1370/5680 [3:41:04<9:28:43,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5535', 'grad_norm': '0.2586', 'learning_rate': '0.0001727', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11223040, 'tokens/trainable': 11099818, 'epoch': '2.009'}
 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1370/5680 [3:41:04<9:28:43,  7.92s/it] 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1371/5680 [3:41:12<9:28:38,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8502', 'grad_norm': '0.3106', 'learning_rate': '0.0001726', 'ppl': '2.34', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11231232, 'tokens/trainable': 11107988, 'epoch': '2.009'}
 24%|██████████████████████████████████████████████▎                                                                                                                                                 | 1371/5680 [3:41:12<9:28:38,  7.92s/it] 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1372/5680 [3:41:20<9:29:10,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7088', 'grad_norm': '0.2462', 'learning_rate': '0.0001726', 'ppl': '2.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11239424, 'tokens/trainable': 11116166, 'epoch': '2.009'}
 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1372/5680 [3:41:20<9:29:10,  7.93s/it] 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1373/5680 [3:41:28<9:28:52,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6037', 'grad_norm': '0.2816', 'learning_rate': '0.0001726', 'ppl': '1.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 11247616, 'tokens/trainable': 11124287, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1373/5680 [3:41:28<9:28:52,  7.92s/it] 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1374/5680 [3:41:36<9:27:32,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7769', 'grad_norm': '0.2923', 'learning_rate': '0.0001725', 'ppl': '2.175', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11255808, 'tokens/trainable': 11132386, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1374/5680 [3:41:36<9:27:32,  7.91s/it] 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1375/5680 [3:41:44<9:26:59,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.9034', 'grad_norm': '0.297', 'learning_rate': '0.0001725', 'ppl': '2.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11264000, 'tokens/trainable': 11140525, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▍                                                                                                                                                 | 1375/5680 [3:41:44<9:26:59,  7.90s/it] 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1376/5680 [3:41:52<9:25:41,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7445', 'grad_norm': '0.2401', 'learning_rate': '0.0001724', 'ppl': '2.105', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11272192, 'tokens/trainable': 11148619, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1376/5680 [3:41:52<9:25:41,  7.89s/it] 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1377/5680 [3:41:59<9:24:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9044', 'grad_norm': '0.2675', 'learning_rate': '0.0001724', 'ppl': '2.47', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11280384, 'tokens/trainable': 11156765, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1377/5680 [3:41:59<9:24:29,  7.87s/it] 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1378/5680 [3:42:07<9:24:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6755', 'grad_norm': '0.2843', 'learning_rate': '0.0001724', 'ppl': '1.965', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11288576, 'tokens/trainable': 11164894, 'epoch': '2.01'}
 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1378/5680 [3:42:07<9:24:41,  7.88s/it] 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1379/5680 [3:42:15<9:25:45,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6926', 'grad_norm': '0.245', 'learning_rate': '0.0001723', 'ppl': '1.999', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 11296768, 'tokens/trainable': 11173047, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▌                                                                                                                                                 | 1379/5680 [3:42:15<9:25:45,  7.89s/it] 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1380/5680 [3:42:23<9:26:54,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8345', 'grad_norm': '0.2922', 'learning_rate': '0.0001723', 'ppl': '2.304', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 11304960, 'tokens/trainable': 11181214, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1380/5680 [3:42:23<9:26:54,  7.91s/it] 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1381/5680 [3:42:31<9:27:54,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6305', 'grad_norm': '0.2828', 'learning_rate': '0.0001723', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 11313152, 'tokens/trainable': 11189350, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1381/5680 [3:42:31<9:27:54,  7.93s/it] 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1382/5680 [3:42:39<9:28:24,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6276', 'grad_norm': '0.2843', 'learning_rate': '0.0001722', 'ppl': '1.873', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11321344, 'tokens/trainable': 11197519, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1382/5680 [3:42:39<9:28:24,  7.93s/it] 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1383/5680 [3:42:47<9:28:36,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8038', 'grad_norm': '0.2635', 'learning_rate': '0.0001722', 'ppl': '2.234', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11329536, 'tokens/trainable': 11205699, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▋                                                                                                                                                 | 1383/5680 [3:42:47<9:28:36,  7.94s/it] 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1384/5680 [3:42:55<9:28:56,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8771', 'grad_norm': '0.275', 'learning_rate': '0.0001721', 'ppl': '2.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 11337728, 'tokens/trainable': 11213852, 'epoch': '2.011'}
 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1384/5680 [3:42:55<9:28:56,  7.95s/it] 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1385/5680 [3:43:03<9:28:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5893', 'grad_norm': '0.228', 'learning_rate': '0.0001721', 'ppl': '1.803', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 11345920, 'tokens/trainable': 11221935, 'epoch': '2.012'}
 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1385/5680 [3:43:03<9:28:24,  7.94s/it] 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1386/5680 [3:43:11<9:28:25,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.9031', 'grad_norm': '0.2547', 'learning_rate': '0.0001721', 'ppl': '2.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11354112, 'tokens/trainable': 11230119, 'epoch': '2.012'}
 24%|██████████████████████████████████████████████▊                                                                                                                                                 | 1386/5680 [3:43:11<9:28:25,  7.94s/it] 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1387/5680 [3:43:19<9:29:32,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8094', 'grad_norm': '0.2573', 'learning_rate': '0.000172', 'ppl': '2.247', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 11362304, 'tokens/trainable': 11238253, 'epoch': '2.012'}
 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1387/5680 [3:43:19<9:29:32,  7.96s/it] 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1388/5680 [3:43:27<9:29:32,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7329', 'grad_norm': '0.2598', 'learning_rate': '0.000172', 'ppl': '2.081', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 11370496, 'tokens/trainable': 11246429, 'epoch': '2.012'}
 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1388/5680 [3:43:27<9:29:32,  7.96s/it] 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1389/5680 [3:43:35<9:29:38,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.8858', 'grad_norm': '0.2546', 'learning_rate': '0.000172', 'ppl': '2.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 11378688, 'tokens/trainable': 11254603, 'epoch': '2.012'}
 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1389/5680 [3:43:35<9:29:38,  7.97s/it] 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1390/5680 [3:43:43<9:29:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8257', 'grad_norm': '0.3145', 'learning_rate': '0.0001719', 'ppl': '2.284', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 11386880, 'tokens/trainable': 11262775, 'epoch': '2.013'}
 24%|██████████████████████████████████████████████▉                                                                                                                                                 | 1390/5680 [3:43:43<9:29:17,  7.96s/it] 24%|███████████████████████████████████████████████                                                                                                                                                 | 1391/5680 [3:43:51<9:29:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5857', 'grad_norm': '0.2336', 'learning_rate': '0.0001719', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 11395072, 'tokens/trainable': 11270931, 'epoch': '2.013'}
 24%|███████████████████████████████████████████████                                                                                                                                                 | 1391/5680 [3:43:51<9:29:30,  7.97s/it] 25%|███████████████████████████████████████████████                                                                                                                                                 | 1392/5680 [3:43:59<9:28:42,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5151', 'grad_norm': '0.254', 'learning_rate': '0.0001718', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11403264, 'tokens/trainable': 11279104, 'epoch': '2.013'}
 25%|███████████████████████████████████████████████                                                                                                                                                 | 1392/5680 [3:43:59<9:28:42,  7.96s/it] 25%|███████████████████████████████████████████████                                                                                                                                                 | 1393/5680 [3:44:07<9:28:17,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8125', 'grad_norm': '0.3011', 'learning_rate': '0.0001718', 'ppl': '2.254', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11411456, 'tokens/trainable': 11287283, 'epoch': '2.013'}
 25%|███████████████████████████████████████████████                                                                                                                                                 | 1393/5680 [3:44:07<9:28:17,  7.95s/it] 25%|███████████████████████████████████████████████                                                                                                                                                 | 1394/5680 [3:44:15<9:28:07,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7526', 'grad_norm': '0.2607', 'learning_rate': '0.0001718', 'ppl': '2.123', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 11419648, 'tokens/trainable': 11295435, 'epoch': '2.013'}
 25%|███████████████████████████████████████████████                                                                                                                                                 | 1394/5680 [3:44:15<9:28:07,  7.95s/it] 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1395/5680 [3:44:23<9:27:25,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6636', 'grad_norm': '0.2623', 'learning_rate': '0.0001717', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11427840, 'tokens/trainable': 11303591, 'epoch': '2.013'}
 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1395/5680 [3:44:23<9:27:25,  7.95s/it] 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1396/5680 [3:44:30<9:27:34,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8911', 'grad_norm': '0.2996', 'learning_rate': '0.0001717', 'ppl': '2.438', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 11436032, 'tokens/trainable': 11311765, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1396/5680 [3:44:30<9:27:34,  7.95s/it] 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1397/5680 [3:44:38<9:27:04,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '1.045', 'grad_norm': '0.3054', 'learning_rate': '0.0001716', 'ppl': '2.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11444224, 'tokens/trainable': 11319942, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▏                                                                                                                                                | 1397/5680 [3:44:38<9:27:04,  7.94s/it] 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1398/5680 [3:44:46<9:26:30,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6689', 'grad_norm': '0.2671', 'learning_rate': '0.0001716', 'ppl': '1.952', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11452416, 'tokens/trainable': 11328113, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1398/5680 [3:44:46<9:26:30,  7.94s/it] 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1399/5680 [3:44:54<9:25:39,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6383', 'grad_norm': '0.2832', 'learning_rate': '0.0001716', 'ppl': '1.893', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11460608, 'tokens/trainable': 11336261, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1399/5680 [3:44:54<9:25:39,  7.93s/it] 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1400/5680 [3:45:02<9:24:13,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8248', 'grad_norm': '0.2828', 'learning_rate': '0.0001715', 'ppl': '2.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11468800, 'tokens/trainable': 11344416, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1400/5680 [3:45:02<9:24:13,  7.91s/it] 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1401/5680 [3:45:10<9:28:10,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6718', 'grad_norm': '0.2465', 'learning_rate': '0.0001715', 'ppl': '1.958', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 11476992, 'tokens/trainable': 11352548, 'epoch': '2.014'}
 25%|███████████████████████████████████████████████▎                                                                                                                                                | 1401/5680 [3:45:10<9:28:10,  7.97s/it] 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1402/5680 [3:45:18<9:26:42,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7556', 'grad_norm': '0.2449', 'learning_rate': '0.0001714', 'ppl': '2.129', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 11485184, 'tokens/trainable': 11360646, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1402/5680 [3:45:18<9:26:42,  7.95s/it] 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1403/5680 [3:45:26<9:24:48,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6418', 'grad_norm': '0.2324', 'learning_rate': '0.0001714', 'ppl': '1.9', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11493376, 'tokens/trainable': 11368808, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1403/5680 [3:45:26<9:24:48,  7.92s/it] 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1404/5680 [3:45:34<9:22:52,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5567', 'grad_norm': '0.2386', 'learning_rate': '0.0001714', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11501568, 'tokens/trainable': 11376961, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1404/5680 [3:45:34<9:22:52,  7.90s/it] 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1405/5680 [3:45:42<9:22:29,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3673', 'grad_norm': '0.2159', 'learning_rate': '0.0001713', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 11509760, 'tokens/trainable': 11385073, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▍                                                                                                                                                | 1405/5680 [3:45:42<9:22:29,  7.89s/it] 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1406/5680 [3:45:50<9:21:41,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7556', 'grad_norm': '0.289', 'learning_rate': '0.0001713', 'ppl': '2.129', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11517952, 'tokens/trainable': 11393238, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1406/5680 [3:45:50<9:21:41,  7.89s/it] 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1407/5680 [3:45:57<9:20:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6754', 'grad_norm': '0.2286', 'learning_rate': '0.0001713', 'ppl': '1.965', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11526144, 'tokens/trainable': 11401388, 'epoch': '2.015'}
 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1407/5680 [3:45:57<9:20:49,  7.87s/it] 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1408/5680 [3:46:05<9:20:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.949', 'grad_norm': '0.2823', 'learning_rate': '0.0001712', 'ppl': '2.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11534336, 'tokens/trainable': 11409526, 'epoch': '2.016'}
 25%|███████████████████████████████████████████████▌                                                                                                                                                | 1408/5680 [3:46:05<9:20:03,  7.87s/it] 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1409/5680 [3:46:13<9:19:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4833', 'grad_norm': '0.2229', 'learning_rate': '0.0001712', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11542528, 'tokens/trainable': 11417651, 'epoch': '2.016'}
 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1409/5680 [3:46:13<9:19:29,  7.86s/it] 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1410/5680 [3:46:21<9:19:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7396', 'grad_norm': '0.2532', 'learning_rate': '0.0001711', 'ppl': '2.095', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11550720, 'tokens/trainable': 11425806, 'epoch': '2.016'}
 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1410/5680 [3:46:21<9:19:42,  7.86s/it] 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1411/5680 [3:46:29<9:19:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4976', 'grad_norm': '0.2294', 'learning_rate': '0.0001711', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11558912, 'tokens/trainable': 11433950, 'epoch': '2.016'}
 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1411/5680 [3:46:29<9:19:42,  7.87s/it] 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1412/5680 [3:46:37<9:19:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5835', 'grad_norm': '0.2503', 'learning_rate': '0.0001711', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11567104, 'tokens/trainable': 11442071, 'epoch': '2.016'}
 25%|███████████████████████████████████████████████▋                                                                                                                                                | 1412/5680 [3:46:37<9:19:02,  7.86s/it] 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1413/5680 [3:46:45<9:18:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6696', 'grad_norm': '0.2583', 'learning_rate': '0.000171', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11575296, 'tokens/trainable': 11450207, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1413/5680 [3:46:45<9:18:32,  7.85s/it] 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1414/5680 [3:46:52<9:18:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6143', 'grad_norm': '0.2407', 'learning_rate': '0.000171', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11583488, 'tokens/trainable': 11458328, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1414/5680 [3:46:52<9:18:30,  7.86s/it] 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1415/5680 [3:47:00<9:19:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5422', 'grad_norm': '0.2687', 'learning_rate': '0.0001709', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11591680, 'tokens/trainable': 11466514, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1415/5680 [3:47:00<9:19:07,  7.87s/it] 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1416/5680 [3:47:08<9:18:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '1.012', 'grad_norm': '0.3012', 'learning_rate': '0.0001709', 'ppl': '2.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 11599872, 'tokens/trainable': 11474621, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▊                                                                                                                                                | 1416/5680 [3:47:08<9:18:37,  7.86s/it] 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1417/5680 [3:47:16<9:18:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6873', 'grad_norm': '0.2455', 'learning_rate': '0.0001709', 'ppl': '1.988', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11608064, 'tokens/trainable': 11482741, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1417/5680 [3:47:16<9:18:50,  7.87s/it] 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1418/5680 [3:47:24<9:19:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6677', 'grad_norm': '0.2658', 'learning_rate': '0.0001708', 'ppl': '1.95', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11616256, 'tokens/trainable': 11490913, 'epoch': '2.017'}
 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1418/5680 [3:47:24<9:19:16,  7.87s/it] 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1419/5680 [3:47:32<9:19:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4799', 'grad_norm': '0.2006', 'learning_rate': '0.0001708', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11624448, 'tokens/trainable': 11499087, 'epoch': '2.018'}
 25%|███████████████████████████████████████████████▉                                                                                                                                                | 1419/5680 [3:47:32<9:19:21,  7.88s/it] 25%|████████████████████████████████████████████████                                                                                                                                                | 1420/5680 [3:47:40<9:19:22,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6892', 'grad_norm': '0.2496', 'learning_rate': '0.0001707', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 11632640, 'tokens/trainable': 11507253, 'epoch': '2.018'}
 25%|████████████████████████████████████████████████                                                                                                                                                | 1420/5680 [3:47:40<9:19:22,  7.88s/it] 25%|████████████████████████████████████████████████                                                                                                                                                | 1421/5680 [3:47:48<9:19:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7991', 'grad_norm': '0.265', 'learning_rate': '0.0001707', 'ppl': '2.223', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11640832, 'tokens/trainable': 11515395, 'epoch': '2.018'}
 25%|████████████████████████████████████████████████                                                                                                                                                | 1421/5680 [3:47:48<9:19:46,  7.89s/it] 25%|████████████████████████████████████████████████                                                                                                                                                | 1422/5680 [3:47:55<9:19:47,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3832', 'grad_norm': '0.2027', 'learning_rate': '0.0001707', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11649024, 'tokens/trainable': 11523550, 'epoch': '2.018'}
 25%|████████████████████████████████████████████████                                                                                                                                                | 1422/5680 [3:47:55<9:19:47,  7.89s/it] 25%|████████████████████████████████████████████████                                                                                                                                                | 1423/5680 [3:48:03<9:19:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4407', 'grad_norm': '0.2251', 'learning_rate': '0.0001706', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11657216, 'tokens/trainable': 11531694, 'epoch': '2.018'}
 25%|████████████████████████████████████████████████                                                                                                                                                | 1423/5680 [3:48:03<9:19:13,  7.88s/it] 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1424/5680 [3:48:11<9:18:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.82', 'grad_norm': '0.3203', 'learning_rate': '0.0001706', 'ppl': '2.271', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 11665408, 'tokens/trainable': 11539867, 'epoch': '2.018'}
 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1424/5680 [3:48:11<9:18:13,  7.87s/it] 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1425/5680 [3:48:19<9:17:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4254', 'grad_norm': '0.2062', 'learning_rate': '0.0001706', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11673600, 'tokens/trainable': 11548033, 'epoch': '2.019'}
 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1425/5680 [3:48:19<9:17:53,  7.87s/it] 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1426/5680 [3:48:27<9:17:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5069', 'grad_norm': '0.2343', 'learning_rate': '0.0001705', 'ppl': '1.66', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11681792, 'tokens/trainable': 11556214, 'epoch': '2.019'}
 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1426/5680 [3:48:27<9:17:48,  7.87s/it] 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1427/5680 [3:48:35<9:17:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6048', 'grad_norm': '0.2363', 'learning_rate': '0.0001705', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 11689984, 'tokens/trainable': 11564398, 'epoch': '2.019'}
 25%|████████████████████████████████████████████████▏                                                                                                                                               | 1427/5680 [3:48:35<9:17:35,  7.87s/it] 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1428/5680 [3:48:43<9:16:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6839', 'grad_norm': '0.2516', 'learning_rate': '0.0001704', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11698176, 'tokens/trainable': 11572535, 'epoch': '2.019'}
 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1428/5680 [3:48:43<9:16:41,  7.86s/it] 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1429/5680 [3:48:50<9:17:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.648', 'grad_norm': '0.2383', 'learning_rate': '0.0001704', 'ppl': '1.912', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11706368, 'tokens/trainable': 11580701, 'epoch': '2.019'}
 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1429/5680 [3:48:50<9:17:04,  7.86s/it] 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1430/5680 [3:48:58<9:17:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5394', 'grad_norm': '0.2372', 'learning_rate': '0.0001704', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11714560, 'tokens/trainable': 11588830, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1430/5680 [3:48:58<9:17:33,  7.87s/it] 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1431/5680 [3:49:06<9:19:00,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5037', 'grad_norm': '0.2522', 'learning_rate': '0.0001703', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 11722752, 'tokens/trainable': 11596965, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▎                                                                                                                                               | 1431/5680 [3:49:06<9:19:00,  7.89s/it] 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1432/5680 [3:49:14<9:18:25,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7575', 'grad_norm': '0.2657', 'learning_rate': '0.0001703', 'ppl': '2.133', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11730944, 'tokens/trainable': 11605103, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1432/5680 [3:49:14<9:18:25,  7.89s/it] 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1433/5680 [3:49:22<9:17:38,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6971', 'grad_norm': '0.2702', 'learning_rate': '0.0001702', 'ppl': '2.008', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11739136, 'tokens/trainable': 11613246, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1433/5680 [3:49:22<9:17:38,  7.88s/it] 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1434/5680 [3:49:30<9:18:15,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7553', 'grad_norm': '0.2593', 'learning_rate': '0.0001702', 'ppl': '2.128', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 11747328, 'tokens/trainable': 11621315, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▍                                                                                                                                               | 1434/5680 [3:49:30<9:18:15,  7.89s/it] 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1435/5680 [3:49:38<9:17:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5707', 'grad_norm': '0.2601', 'learning_rate': '0.0001702', 'ppl': '1.769', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 11755520, 'tokens/trainable': 11629463, 'epoch': '2.02'}
 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1435/5680 [3:49:38<9:17:19,  7.88s/it] 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1436/5680 [3:49:46<9:17:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7909', 'grad_norm': '0.2562', 'learning_rate': '0.0001701', 'ppl': '2.205', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11763712, 'tokens/trainable': 11637606, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1436/5680 [3:49:46<9:17:08,  7.88s/it] 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1437/5680 [3:49:54<9:16:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5501', 'grad_norm': '0.2537', 'learning_rate': '0.0001701', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11771904, 'tokens/trainable': 11645786, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1437/5680 [3:49:54<9:16:47,  7.87s/it] 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1438/5680 [3:50:01<9:15:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7211', 'grad_norm': '0.2412', 'learning_rate': '0.00017', 'ppl': '2.057', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 11780096, 'tokens/trainable': 11653957, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▌                                                                                                                                               | 1438/5680 [3:50:01<9:15:44,  7.86s/it] 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1439/5680 [3:50:09<9:15:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6063', 'grad_norm': '0.2537', 'learning_rate': '0.00017', 'ppl': '1.834', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 11788288, 'tokens/trainable': 11662119, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1439/5680 [3:50:09<9:15:11,  7.85s/it] 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1440/5680 [3:50:17<9:17:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.578', 'grad_norm': '0.3148', 'learning_rate': '0.00017', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 11796480, 'tokens/trainable': 11670245, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1440/5680 [3:50:17<9:17:10,  7.88s/it] 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1441/5680 [3:50:25<9:17:13,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5218', 'grad_norm': '0.478', 'learning_rate': '0.0001699', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 11804672, 'tokens/trainable': 11678336, 'epoch': '2.021'}
 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1441/5680 [3:50:25<9:17:13,  7.89s/it] 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1442/5680 [3:50:33<9:16:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5344', 'grad_norm': '0.2732', 'learning_rate': '0.0001699', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11812864, 'tokens/trainable': 11686491, 'epoch': '2.022'}
 25%|████████████████████████████████████████████████▋                                                                                                                                               | 1442/5680 [3:50:33<9:16:40,  7.88s/it] 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1443/5680 [3:50:41<9:16:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6744', 'grad_norm': '0.2908', 'learning_rate': '0.0001698', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11821056, 'tokens/trainable': 11694655, 'epoch': '2.022'}
 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1443/5680 [3:50:41<9:16:40,  7.88s/it] 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1444/5680 [3:50:49<9:22:36,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3831', 'grad_norm': '0.2329', 'learning_rate': '0.0001698', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 11829248, 'tokens/trainable': 11702764, 'epoch': '2.022'}
 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1444/5680 [3:50:49<9:22:36,  7.97s/it] 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1445/5680 [3:50:57<9:20:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.81', 'grad_norm': '0.2763', 'learning_rate': '0.0001698', 'ppl': '2.248', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11837440, 'tokens/trainable': 11710950, 'epoch': '2.022'}
 25%|████████████████████████████████████████████████▊                                                                                                                                               | 1445/5680 [3:50:57<9:20:55,  7.95s/it] 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1446/5680 [3:51:05<9:19:10,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5005', 'grad_norm': '0.2391', 'learning_rate': '0.0001697', 'ppl': '1.65', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 11845632, 'tokens/trainable': 11719140, 'epoch': '2.022'}
 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1446/5680 [3:51:05<9:19:10,  7.92s/it] 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1447/5680 [3:51:13<9:18:23,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5982', 'grad_norm': '0.2367', 'learning_rate': '0.0001697', 'ppl': '1.819', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11853824, 'tokens/trainable': 11727305, 'epoch': '2.023'}
 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1447/5680 [3:51:13<9:18:23,  7.91s/it] 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1448/5680 [3:51:21<9:17:54,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5025', 'grad_norm': '0.2168', 'learning_rate': '0.0001696', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 11862016, 'tokens/trainable': 11735410, 'epoch': '2.023'}
 25%|████████████████████████████████████████████████▉                                                                                                                                               | 1448/5680 [3:51:21<9:17:54,  7.91s/it] 26%|████████████████████████████████████████████████▉                                                                                                                                               | 1449/5680 [3:51:28<9:17:18,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5962', 'grad_norm': '0.234', 'learning_rate': '0.0001696', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 11870208, 'tokens/trainable': 11743547, 'epoch': '2.023'}
 26%|████████████████████████████████████████████████▉                                                                                                                                               | 1449/5680 [3:51:28<9:17:18,  7.90s/it] 26%|█████████████████████████████████████████████████                                                                                                                                               | 1450/5680 [3:51:36<9:16:09,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.028', 'grad_norm': '0.3499', 'learning_rate': '0.0001696', 'ppl': '2.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 11878400, 'tokens/trainable': 11751731, 'epoch': '2.023'}
 26%|█████████████████████████████████████████████████                                                                                                                                               | 1450/5680 [3:51:36<9:16:09,  7.89s/it] 26%|█████████████████████████████████████████████████                                                                                                                                               | 1451/5680 [3:51:44<9:16:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7182', 'grad_norm': '0.2622', 'learning_rate': '0.0001695', 'ppl': '2.051', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11886592, 'tokens/trainable': 11759863, 'epoch': '2.023'}
 26%|█████████████████████████████████████████████████                                                                                                                                               | 1451/5680 [3:51:44<9:16:07,  7.89s/it] 26%|█████████████████████████████████████████████████                                                                                                                                               | 1452/5680 [3:51:52<9:15:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.801', 'grad_norm': '0.3037', 'learning_rate': '0.0001695', 'ppl': '2.228', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11894784, 'tokens/trainable': 11768003, 'epoch': '2.023'}
 26%|█████████████████████████████████████████████████                                                                                                                                               | 1452/5680 [3:51:52<9:15:30,  7.88s/it] 26%|█████████████████████████████████████████████████                                                                                                                                               | 1453/5680 [3:52:00<9:15:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.3009', 'learning_rate': '0.0001694', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11902976, 'tokens/trainable': 11776135, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████                                                                                                                                               | 1453/5680 [3:52:00<9:15:00,  7.88s/it] 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1454/5680 [3:52:08<9:13:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7757', 'grad_norm': '0.2663', 'learning_rate': '0.0001694', 'ppl': '2.172', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 11911168, 'tokens/trainable': 11784300, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1454/5680 [3:52:08<9:13:33,  7.86s/it] 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1455/5680 [3:52:16<9:13:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6077', 'grad_norm': '0.2523', 'learning_rate': '0.0001694', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11919360, 'tokens/trainable': 11792445, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1455/5680 [3:52:16<9:13:37,  7.86s/it] 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1456/5680 [3:52:24<9:14:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6628', 'grad_norm': '0.2814', 'learning_rate': '0.0001693', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 11927552, 'tokens/trainable': 11800608, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████▏                                                                                                                                              | 1456/5680 [3:52:24<9:14:00,  7.87s/it] 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1457/5680 [3:52:31<9:13:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6154', 'grad_norm': '0.2283', 'learning_rate': '0.0001693', 'ppl': '1.85', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 11935744, 'tokens/trainable': 11808736, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1457/5680 [3:52:31<9:13:44,  7.87s/it] 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1458/5680 [3:52:39<9:13:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9589', 'grad_norm': '0.2598', 'learning_rate': '0.0001692', 'ppl': '2.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 11943936, 'tokens/trainable': 11816919, 'epoch': '2.024'}
 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1458/5680 [3:52:39<9:13:37,  7.87s/it] 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1459/5680 [3:52:47<9:12:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5905', 'grad_norm': '0.265', 'learning_rate': '0.0001692', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 11952128, 'tokens/trainable': 11825062, 'epoch': '2.025'}
 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1459/5680 [3:52:47<9:12:28,  7.85s/it] 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1460/5680 [3:52:55<9:12:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6226', 'grad_norm': '0.2643', 'learning_rate': '0.0001692', 'ppl': '1.864', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 11960320, 'tokens/trainable': 11833232, 'epoch': '2.025'}
 26%|█████████████████████████████████████████████████▎                                                                                                                                              | 1460/5680 [3:52:55<9:12:34,  7.86s/it] 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1461/5680 [3:53:03<9:13:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4929', 'grad_norm': '0.2319', 'learning_rate': '0.0001691', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 11968512, 'tokens/trainable': 11841355, 'epoch': '2.025'}
 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1461/5680 [3:53:03<9:13:01,  7.86s/it] 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1462/5680 [3:53:11<9:12:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6968', 'grad_norm': '0.2556', 'learning_rate': '0.0001691', 'ppl': '2.007', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 11976704, 'tokens/trainable': 11849514, 'epoch': '2.025'}
 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1462/5680 [3:53:11<9:12:16,  7.86s/it] 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1463/5680 [3:53:19<9:13:31,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5836', 'grad_norm': '0.2391', 'learning_rate': '0.000169', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 11984896, 'tokens/trainable': 11857672, 'epoch': '2.025'}
 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1463/5680 [3:53:19<9:13:31,  7.88s/it] 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1464/5680 [3:53:26<9:13:46,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6298', 'grad_norm': '0.2641', 'learning_rate': '0.000169', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 11993088, 'tokens/trainable': 11865858, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▍                                                                                                                                              | 1464/5680 [3:53:26<9:13:46,  7.88s/it] 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1465/5680 [3:53:34<9:13:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5807', 'grad_norm': '0.2553', 'learning_rate': '0.000169', 'ppl': '1.787', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12001280, 'tokens/trainable': 11874032, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1465/5680 [3:53:34<9:13:53,  7.88s/it] 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1466/5680 [3:53:42<9:13:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.694', 'grad_norm': '0.253', 'learning_rate': '0.0001689', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12009472, 'tokens/trainable': 11882185, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1466/5680 [3:53:42<9:13:30,  7.88s/it] 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1467/5680 [3:53:50<9:12:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7024', 'grad_norm': '0.2638', 'learning_rate': '0.0001689', 'ppl': '2.019', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 12017664, 'tokens/trainable': 11890345, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1467/5680 [3:53:50<9:12:22,  7.87s/it] 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1468/5680 [3:53:58<9:12:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6628', 'grad_norm': '0.3139', 'learning_rate': '0.0001688', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12025856, 'tokens/trainable': 11898481, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▌                                                                                                                                              | 1468/5680 [3:53:58<9:12:08,  7.87s/it] 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1469/5680 [3:54:06<9:12:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7243', 'grad_norm': '0.2789', 'learning_rate': '0.0001688', 'ppl': '2.063', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12034048, 'tokens/trainable': 11906625, 'epoch': '2.026'}
 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1469/5680 [3:54:06<9:12:30,  7.87s/it] 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1470/5680 [3:54:14<9:11:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4943', 'grad_norm': '0.2233', 'learning_rate': '0.0001688', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12042240, 'tokens/trainable': 11914767, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1470/5680 [3:54:14<9:11:46,  7.86s/it] 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1471/5680 [3:54:21<9:11:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6681', 'grad_norm': '0.3423', 'learning_rate': '0.0001687', 'ppl': '1.95', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 12050432, 'tokens/trainable': 11922942, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▋                                                                                                                                              | 1471/5680 [3:54:21<9:11:25,  7.86s/it] 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1472/5680 [3:54:29<9:10:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6415', 'grad_norm': '0.3484', 'learning_rate': '0.0001687', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 12058624, 'tokens/trainable': 11931126, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1472/5680 [3:54:29<9:10:26,  7.85s/it] 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1473/5680 [3:54:37<9:10:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8499', 'grad_norm': '0.2823', 'learning_rate': '0.0001686', 'ppl': '2.339', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12066816, 'tokens/trainable': 11939275, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1473/5680 [3:54:37<9:10:59,  7.86s/it] 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1474/5680 [3:54:45<9:10:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8112', 'grad_norm': '0.2641', 'learning_rate': '0.0001686', 'ppl': '2.251', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12075008, 'tokens/trainable': 11947434, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1474/5680 [3:54:45<9:10:56,  7.86s/it] 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1475/5680 [3:54:53<9:10:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8288', 'grad_norm': '0.2863', 'learning_rate': '0.0001686', 'ppl': '2.291', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 12083200, 'tokens/trainable': 11955608, 'epoch': '2.027'}
 26%|█████████████████████████████████████████████████▊                                                                                                                                              | 1475/5680 [3:54:53<9:10:09,  7.85s/it] 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1476/5680 [3:55:01<9:09:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6792', 'grad_norm': '0.253', 'learning_rate': '0.0001685', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 12091392, 'tokens/trainable': 11963791, 'epoch': '2.028'}
 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1476/5680 [3:55:01<9:09:42,  7.85s/it] 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1477/5680 [3:55:09<9:09:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5108', 'grad_norm': '0.246', 'learning_rate': '0.0001685', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12099584, 'tokens/trainable': 11971893, 'epoch': '2.028'}
 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1477/5680 [3:55:09<9:09:59,  7.85s/it] 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1478/5680 [3:55:16<9:09:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.56', 'grad_norm': '0.258', 'learning_rate': '0.0001684', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 12107776, 'tokens/trainable': 11980069, 'epoch': '2.028'}
 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1478/5680 [3:55:16<9:09:43,  7.85s/it] 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1479/5680 [3:55:24<9:10:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5177', 'grad_norm': '0.2517', 'learning_rate': '0.0001684', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12115968, 'tokens/trainable': 11988202, 'epoch': '2.028'}
 26%|█████████████████████████████████████████████████▉                                                                                                                                              | 1479/5680 [3:55:24<9:10:03,  7.86s/it] 26%|██████████████████████████████████████████████████                                                                                                                                              | 1480/5680 [3:55:32<9:11:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4251', 'grad_norm': '0.2391', 'learning_rate': '0.0001684', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12124160, 'tokens/trainable': 11996365, 'epoch': '2.028'}
 26%|██████████████████████████████████████████████████                                                                                                                                              | 1480/5680 [3:55:32<9:11:02,  7.87s/it] 26%|██████████████████████████████████████████████████                                                                                                                                              | 1481/5680 [3:55:40<9:10:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6458', 'grad_norm': '0.2795', 'learning_rate': '0.0001683', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12132352, 'tokens/trainable': 12004508, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████                                                                                                                                              | 1481/5680 [3:55:40<9:10:56,  7.87s/it] 26%|██████████████████████████████████████████████████                                                                                                                                              | 1482/5680 [3:55:48<9:16:50,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7119', 'grad_norm': '0.2513', 'learning_rate': '0.0001683', 'ppl': '2.038', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.3', 'tokens/total': 12140544, 'tokens/trainable': 12012636, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████                                                                                                                                              | 1482/5680 [3:55:48<9:16:50,  7.96s/it] 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1483/5680 [3:55:56<9:13:52,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5428', 'grad_norm': '0.2315', 'learning_rate': '0.0001682', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 12148736, 'tokens/trainable': 12020791, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1483/5680 [3:55:56<9:13:52,  7.92s/it] 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1484/5680 [3:56:04<9:11:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6813', 'grad_norm': '0.2789', 'learning_rate': '0.0001682', 'ppl': '1.977', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 12156928, 'tokens/trainable': 12028929, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1484/5680 [3:56:04<9:11:10,  7.88s/it] 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1485/5680 [3:56:12<9:10:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6605', 'grad_norm': '0.2444', 'learning_rate': '0.0001682', 'ppl': '1.936', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 12165120, 'tokens/trainable': 12037094, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1485/5680 [3:56:12<9:10:25,  7.87s/it] 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1486/5680 [3:56:20<9:10:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5518', 'grad_norm': '0.281', 'learning_rate': '0.0001681', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 12173312, 'tokens/trainable': 12045278, 'epoch': '2.029'}
 26%|██████████████████████████████████████████████████▏                                                                                                                                             | 1486/5680 [3:56:20<9:10:26,  7.87s/it] 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1487/5680 [3:56:28<9:15:39,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6023', 'grad_norm': '0.2548', 'learning_rate': '0.0001681', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 12181504, 'tokens/trainable': 12053422, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1487/5680 [3:56:28<9:15:39,  7.95s/it] 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1488/5680 [3:56:36<9:13:48,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5829', 'grad_norm': '0.2695', 'learning_rate': '0.000168', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 12189696, 'tokens/trainable': 12061611, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1488/5680 [3:56:36<9:13:48,  7.93s/it] 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1489/5680 [3:56:44<9:14:00,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5569', 'grad_norm': '0.2743', 'learning_rate': '0.000168', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 12197888, 'tokens/trainable': 12069748, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1489/5680 [3:56:44<9:14:00,  7.93s/it] 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1490/5680 [3:56:51<9:12:22,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7624', 'grad_norm': '0.2559', 'learning_rate': '0.000168', 'ppl': '2.144', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 12206080, 'tokens/trainable': 12077919, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▎                                                                                                                                             | 1490/5680 [3:56:51<9:12:22,  7.91s/it] 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1491/5680 [3:56:59<9:11:15,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5573', 'grad_norm': '0.243', 'learning_rate': '0.0001679', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 12214272, 'tokens/trainable': 12086110, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1491/5680 [3:56:59<9:11:15,  7.90s/it] 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1492/5680 [3:57:07<9:11:55,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5207', 'grad_norm': '0.2235', 'learning_rate': '0.0001679', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 12222464, 'tokens/trainable': 12094246, 'epoch': '2.03'}
 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1492/5680 [3:57:07<9:11:55,  7.91s/it] 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1493/5680 [3:57:15<9:12:04,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5393', 'grad_norm': '0.2569', 'learning_rate': '0.0001678', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 12230656, 'tokens/trainable': 12102327, 'epoch': '2.031'}
 26%|██████████████████████████████████████████████████▍                                                                                                                                             | 1493/5680 [3:57:15<9:12:04,  7.91s/it] 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1494/5680 [3:57:23<9:12:05,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6115', 'grad_norm': '0.2526', 'learning_rate': '0.0001678', 'ppl': '1.843', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 12238848, 'tokens/trainable': 12110421, 'epoch': '2.031'}
 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1494/5680 [3:57:23<9:12:05,  7.91s/it] 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1495/5680 [3:57:31<9:11:25,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7531', 'grad_norm': '0.2652', 'learning_rate': '0.0001678', 'ppl': '2.124', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12247040, 'tokens/trainable': 12118574, 'epoch': '2.031'}
 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1495/5680 [3:57:31<9:11:25,  7.91s/it] 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1496/5680 [3:57:39<9:10:31,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7783', 'grad_norm': '0.2882', 'learning_rate': '0.0001677', 'ppl': '2.178', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 12255232, 'tokens/trainable': 12126754, 'epoch': '2.031'}
 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1496/5680 [3:57:39<9:10:31,  7.89s/it] 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1497/5680 [3:57:47<9:09:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5234', 'grad_norm': '0.2618', 'learning_rate': '0.0001677', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 12263424, 'tokens/trainable': 12134917, 'epoch': '2.031'}
 26%|██████████████████████████████████████████████████▌                                                                                                                                             | 1497/5680 [3:57:47<9:09:34,  7.88s/it] 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1498/5680 [3:57:55<9:09:48,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7874', 'grad_norm': '0.3012', 'learning_rate': '0.0001676', 'ppl': '2.198', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 12271616, 'tokens/trainable': 12143058, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1498/5680 [3:57:55<9:09:48,  7.89s/it] 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1499/5680 [3:58:02<9:08:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9725', 'grad_norm': '0.277', 'learning_rate': '0.0001676', 'ppl': '2.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12279808, 'tokens/trainable': 12151182, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1499/5680 [3:58:02<9:08:56,  7.88s/it] 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1500/5680 [3:58:10<9:08:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6762', 'grad_norm': '0.2423', 'learning_rate': '0.0001676', 'ppl': '1.966', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 12288000, 'tokens/trainable': 12159365, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1500/5680 [3:58:10<9:08:50,  7.88s/it] 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1501/5680 [3:58:18<9:08:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5699', 'grad_norm': '0.2324', 'learning_rate': '0.0001675', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12296192, 'tokens/trainable': 12167502, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▋                                                                                                                                             | 1501/5680 [3:58:18<9:08:37,  7.88s/it] 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1502/5680 [3:58:26<9:08:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8455', 'grad_norm': '0.27', 'learning_rate': '0.0001675', 'ppl': '2.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 12304384, 'tokens/trainable': 12175687, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1502/5680 [3:58:26<9:08:11,  7.87s/it] 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1503/5680 [3:58:34<9:08:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4649', 'grad_norm': '0.2463', 'learning_rate': '0.0001674', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12312576, 'tokens/trainable': 12183855, 'epoch': '2.032'}
 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1503/5680 [3:58:34<9:08:41,  7.88s/it] 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1504/5680 [3:58:42<9:07:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6745', 'grad_norm': '0.242', 'learning_rate': '0.0001674', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 12320768, 'tokens/trainable': 12191990, 'epoch': '2.033'}
 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1504/5680 [3:58:42<9:07:51,  7.87s/it] 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1505/5680 [3:58:50<9:07:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7071', 'grad_norm': '0.2532', 'learning_rate': '0.0001674', 'ppl': '2.028', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12328960, 'tokens/trainable': 12200081, 'epoch': '2.033'}
 26%|██████████████████████████████████████████████████▊                                                                                                                                             | 1505/5680 [3:58:50<9:07:06,  7.86s/it] 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1506/5680 [3:58:57<9:06:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6748', 'grad_norm': '0.2825', 'learning_rate': '0.0001673', 'ppl': '1.964', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12337152, 'tokens/trainable': 12208215, 'epoch': '2.033'}
 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1506/5680 [3:58:57<9:06:52,  7.86s/it] 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1507/5680 [3:59:05<9:06:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.709', 'grad_norm': '0.2769', 'learning_rate': '0.0001673', 'ppl': '2.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 12345344, 'tokens/trainable': 12216398, 'epoch': '2.033'}
 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1507/5680 [3:59:05<9:06:42,  7.86s/it] 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1508/5680 [3:59:13<9:05:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8211', 'grad_norm': '0.2626', 'learning_rate': '0.0001672', 'ppl': '2.273', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 12353536, 'tokens/trainable': 12224559, 'epoch': '2.033'}
 27%|██████████████████████████████████████████████████▉                                                                                                                                             | 1508/5680 [3:59:13<9:05:46,  7.85s/it] 27%|███████████████████████████████████████████████████                                                                                                                                             | 1509/5680 [3:59:21<9:05:41,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5846', 'grad_norm': '0.2614', 'learning_rate': '0.0001672', 'ppl': '1.794', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 12361728, 'tokens/trainable': 12232726, 'epoch': '2.033'}
 27%|███████████████████████████████████████████████████                                                                                                                                             | 1509/5680 [3:59:21<9:05:41,  7.85s/it] 27%|███████████████████████████████████████████████████                                                                                                                                             | 1510/5680 [3:59:29<9:06:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4124', 'grad_norm': '0.2232', 'learning_rate': '0.0001671', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12369920, 'tokens/trainable': 12240871, 'epoch': '2.034'}
 27%|███████████████████████████████████████████████████                                                                                                                                             | 1510/5680 [3:59:29<9:06:38,  7.87s/it] 27%|███████████████████████████████████████████████████                                                                                                                                             | 1511/5680 [3:59:37<9:06:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6536', 'grad_norm': '0.2598', 'learning_rate': '0.0001671', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12378112, 'tokens/trainable': 12249008, 'epoch': '2.034'}
 27%|███████████████████████████████████████████████████                                                                                                                                             | 1511/5680 [3:59:37<9:06:49,  7.87s/it] 27%|███████████████████████████████████████████████████                                                                                                                                             | 1512/5680 [3:59:45<9:07:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6411', 'grad_norm': '0.258', 'learning_rate': '0.0001671', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12386304, 'tokens/trainable': 12257129, 'epoch': '2.034'}
 27%|███████████████████████████████████████████████████                                                                                                                                             | 1512/5680 [3:59:45<9:07:05,  7.88s/it] 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1513/5680 [3:59:53<9:06:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7298', 'grad_norm': '0.2762', 'learning_rate': '0.000167', 'ppl': '2.075', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 12394496, 'tokens/trainable': 12265301, 'epoch': '2.034'}
 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1513/5680 [3:59:53<9:06:32,  7.87s/it] 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1514/5680 [4:00:00<9:06:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.735', 'grad_norm': '0.2604', 'learning_rate': '0.000167', 'ppl': '2.086', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12402688, 'tokens/trainable': 12273454, 'epoch': '2.034'}
 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1514/5680 [4:00:00<9:06:22,  7.87s/it] 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1515/5680 [4:00:08<9:05:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5935', 'grad_norm': '0.2885', 'learning_rate': '0.0001669', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12410880, 'tokens/trainable': 12281576, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1515/5680 [4:00:08<9:05:52,  7.86s/it] 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1516/5680 [4:00:16<9:05:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6568', 'grad_norm': '0.2607', 'learning_rate': '0.0001669', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12419072, 'tokens/trainable': 12289723, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▏                                                                                                                                            | 1516/5680 [4:00:16<9:05:29,  7.86s/it] 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1517/5680 [4:00:24<9:05:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5837', 'grad_norm': '0.2578', 'learning_rate': '0.0001669', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12427264, 'tokens/trainable': 12297849, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1517/5680 [4:00:24<9:05:42,  7.87s/it] 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1518/5680 [4:00:32<9:04:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5086', 'grad_norm': '0.2211', 'learning_rate': '0.0001668', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 12435456, 'tokens/trainable': 12306039, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1518/5680 [4:00:32<9:04:53,  7.86s/it] 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1519/5680 [4:00:40<9:05:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8002', 'grad_norm': '0.2637', 'learning_rate': '0.0001668', 'ppl': '2.226', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12443648, 'tokens/trainable': 12314217, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▎                                                                                                                                            | 1519/5680 [4:00:40<9:05:35,  7.87s/it] 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1520/5680 [4:00:48<9:05:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7782', 'grad_norm': '0.3505', 'learning_rate': '0.0001667', 'ppl': '2.178', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12451840, 'tokens/trainable': 12322331, 'epoch': '2.035'}
 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1520/5680 [4:00:48<9:05:46,  7.87s/it] 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1521/5680 [4:00:55<9:05:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5139', 'grad_norm': '0.2448', 'learning_rate': '0.0001667', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12460032, 'tokens/trainable': 12330456, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1521/5680 [4:00:55<9:05:07,  7.86s/it] 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1522/5680 [4:01:03<9:05:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6609', 'grad_norm': '0.2613', 'learning_rate': '0.0001667', 'ppl': '1.936', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12468224, 'tokens/trainable': 12338622, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1522/5680 [4:01:03<9:05:08,  7.87s/it] 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1523/5680 [4:01:11<9:05:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5771', 'grad_norm': '0.2437', 'learning_rate': '0.0001666', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12476416, 'tokens/trainable': 12346806, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▍                                                                                                                                            | 1523/5680 [4:01:11<9:05:41,  7.88s/it] 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1524/5680 [4:01:19<9:04:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4926', 'grad_norm': '0.2305', 'learning_rate': '0.0001666', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12484608, 'tokens/trainable': 12354946, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1524/5680 [4:01:19<9:04:56,  7.87s/it] 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1525/5680 [4:01:27<9:04:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5331', 'grad_norm': '0.2569', 'learning_rate': '0.0001665', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12492800, 'tokens/trainable': 12363064, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1525/5680 [4:01:27<9:04:19,  7.86s/it] 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1526/5680 [4:01:35<9:04:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8064', 'grad_norm': '0.2633', 'learning_rate': '0.0001665', 'ppl': '2.24', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 12500992, 'tokens/trainable': 12371204, 'epoch': '2.036'}
 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1526/5680 [4:01:35<9:04:04,  7.86s/it] 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1527/5680 [4:01:43<9:04:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7769', 'grad_norm': '0.2667', 'learning_rate': '0.0001664', 'ppl': '2.175', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12509184, 'tokens/trainable': 12379352, 'epoch': '2.037'}
 27%|███████████████████████████████████████████████████▌                                                                                                                                            | 1527/5680 [4:01:43<9:04:29,  7.87s/it] 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1528/5680 [4:01:51<9:04:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5549', 'grad_norm': '0.3037', 'learning_rate': '0.0001664', 'ppl': '1.742', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 12517376, 'tokens/trainable': 12387436, 'epoch': '2.037'}
 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1528/5680 [4:01:51<9:04:25,  7.87s/it] 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1529/5680 [4:01:58<9:05:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.741', 'grad_norm': '0.3479', 'learning_rate': '0.0001664', 'ppl': '2.098', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12525568, 'tokens/trainable': 12395614, 'epoch': '2.037'}
 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1529/5680 [4:01:58<9:05:10,  7.88s/it] 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1530/5680 [4:02:07<9:10:05,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6963', 'grad_norm': '0.2589', 'learning_rate': '0.0001663', 'ppl': '2.006', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 12533760, 'tokens/trainable': 12403764, 'epoch': '2.037'}
 27%|███████████████████████████████████████████████████▋                                                                                                                                            | 1530/5680 [4:02:07<9:10:05,  7.95s/it] 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1531/5680 [4:02:14<9:09:40,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8912', 'grad_norm': '0.318', 'learning_rate': '0.0001663', 'ppl': '2.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12541952, 'tokens/trainable': 12411941, 'epoch': '2.037'}
 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1531/5680 [4:02:14<9:09:40,  7.95s/it] 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1532/5680 [4:02:22<9:08:31,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7294', 'grad_norm': '0.2688', 'learning_rate': '0.0001662', 'ppl': '2.074', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 12550144, 'tokens/trainable': 12420119, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1532/5680 [4:02:22<9:08:31,  7.93s/it] 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1533/5680 [4:02:30<9:07:53,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5421', 'grad_norm': '0.2239', 'learning_rate': '0.0001662', 'ppl': '1.72', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12558336, 'tokens/trainable': 12428306, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1533/5680 [4:02:30<9:07:53,  7.93s/it] 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1534/5680 [4:02:38<9:07:38,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6573', 'grad_norm': '0.2679', 'learning_rate': '0.0001662', 'ppl': '1.93', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 12566528, 'tokens/trainable': 12436369, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▊                                                                                                                                            | 1534/5680 [4:02:38<9:07:38,  7.93s/it] 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1535/5680 [4:02:46<9:06:17,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8137', 'grad_norm': '0.2928', 'learning_rate': '0.0001661', 'ppl': '2.256', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12574720, 'tokens/trainable': 12444506, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1535/5680 [4:02:46<9:06:17,  7.91s/it] 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1536/5680 [4:02:54<9:05:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6201', 'grad_norm': '0.2634', 'learning_rate': '0.0001661', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 12582912, 'tokens/trainable': 12452661, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1536/5680 [4:02:54<9:05:12,  7.89s/it] 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1537/5680 [4:03:02<9:05:01,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6296', 'grad_norm': '0.3195', 'learning_rate': '0.000166', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 12591104, 'tokens/trainable': 12460753, 'epoch': '2.038'}
 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1537/5680 [4:03:02<9:05:01,  7.89s/it] 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1538/5680 [4:03:10<9:04:49,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4762', 'grad_norm': '0.3199', 'learning_rate': '0.000166', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12599296, 'tokens/trainable': 12468908, 'epoch': '2.039'}
 27%|███████████████████████████████████████████████████▉                                                                                                                                            | 1538/5680 [4:03:10<9:04:49,  7.89s/it] 27%|████████████████████████████████████████████████████                                                                                                                                            | 1539/5680 [4:03:18<9:06:00,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5228', 'grad_norm': '0.2373', 'learning_rate': '0.0001659', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 12607488, 'tokens/trainable': 12477063, 'epoch': '2.039'}
 27%|████████████████████████████████████████████████████                                                                                                                                            | 1539/5680 [4:03:18<9:06:00,  7.91s/it] 27%|████████████████████████████████████████████████████                                                                                                                                            | 1540/5680 [4:03:26<9:06:15,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8639', 'grad_norm': '0.2895', 'learning_rate': '0.0001659', 'ppl': '2.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12615680, 'tokens/trainable': 12485251, 'epoch': '2.039'}
 27%|████████████████████████████████████████████████████                                                                                                                                            | 1540/5680 [4:03:26<9:06:15,  7.92s/it] 27%|████████████████████████████████████████████████████                                                                                                                                            | 1541/5680 [4:03:34<9:06:57,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6991', 'grad_norm': '0.3191', 'learning_rate': '0.0001659', 'ppl': '2.012', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 12623872, 'tokens/trainable': 12493408, 'epoch': '2.039'}
 27%|████████████████████████████████████████████████████                                                                                                                                            | 1541/5680 [4:03:34<9:06:57,  7.93s/it] 27%|████████████████████████████████████████████████████                                                                                                                                            | 1542/5680 [4:03:41<9:06:36,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5242', 'grad_norm': '0.2432', 'learning_rate': '0.0001658', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12632064, 'tokens/trainable': 12501561, 'epoch': '2.039'}
 27%|████████████████████████████████████████████████████                                                                                                                                            | 1542/5680 [4:03:41<9:06:36,  7.93s/it] 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1543/5680 [4:03:49<9:06:35,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '1.001', 'grad_norm': '0.3444', 'learning_rate': '0.0001658', 'ppl': '2.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 12640256, 'tokens/trainable': 12509738, 'epoch': '2.039'}
 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1543/5680 [4:03:49<9:06:35,  7.93s/it] 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1544/5680 [4:03:57<9:07:01,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5812', 'grad_norm': '0.2383', 'learning_rate': '0.0001657', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 12648448, 'tokens/trainable': 12517875, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1544/5680 [4:03:57<9:07:01,  7.94s/it] 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1545/5680 [4:04:05<9:06:33,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.8796', 'grad_norm': '0.3146', 'learning_rate': '0.0001657', 'ppl': '2.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 12656640, 'tokens/trainable': 12526032, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▏                                                                                                                                           | 1545/5680 [4:04:05<9:06:33,  7.93s/it] 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1546/5680 [4:04:13<9:06:18,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7726', 'grad_norm': '0.2433', 'learning_rate': '0.0001657', 'ppl': '2.165', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 12664832, 'tokens/trainable': 12534183, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1546/5680 [4:04:13<9:06:18,  7.93s/it] 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1547/5680 [4:04:21<9:06:38,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8054', 'grad_norm': '0.2681', 'learning_rate': '0.0001656', 'ppl': '2.238', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 12673024, 'tokens/trainable': 12542308, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1547/5680 [4:04:21<9:06:38,  7.94s/it] 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1548/5680 [4:04:29<9:05:37,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.63', 'grad_norm': '0.2559', 'learning_rate': '0.0001656', 'ppl': '1.878', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12681216, 'tokens/trainable': 12550449, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1548/5680 [4:04:29<9:05:37,  7.92s/it] 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1549/5680 [4:04:37<9:05:55,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.726', 'grad_norm': '0.2708', 'learning_rate': '0.0001655', 'ppl': '2.067', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12689408, 'tokens/trainable': 12558627, 'epoch': '2.04'}
 27%|████████████████████████████████████████████████████▎                                                                                                                                           | 1549/5680 [4:04:37<9:05:55,  7.93s/it] 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1550/5680 [4:04:45<9:05:44,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7154', 'grad_norm': '0.261', 'learning_rate': '0.0001655', 'ppl': '2.045', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 12697600, 'tokens/trainable': 12566766, 'epoch': '2.041'}
 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1550/5680 [4:04:45<9:05:44,  7.93s/it] 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1551/5680 [4:04:53<9:05:48,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6742', 'grad_norm': '0.2969', 'learning_rate': '0.0001654', 'ppl': '1.962', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 12705792, 'tokens/trainable': 12574892, 'epoch': '2.041'}
 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1551/5680 [4:04:53<9:05:48,  7.93s/it] 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1552/5680 [4:05:01<9:05:37,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5384', 'grad_norm': '0.2919', 'learning_rate': '0.0001654', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 12713984, 'tokens/trainable': 12582986, 'epoch': '2.041'}
 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1552/5680 [4:05:01<9:05:37,  7.93s/it] 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1553/5680 [4:05:09<9:04:54,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5398', 'grad_norm': '0.2562', 'learning_rate': '0.0001654', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 12722176, 'tokens/trainable': 12591097, 'epoch': '2.041'}
 27%|████████████████████████████████████████████████████▍                                                                                                                                           | 1553/5680 [4:05:09<9:04:54,  7.92s/it] 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1554/5680 [4:05:17<9:04:33,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.8079', 'grad_norm': '0.2882', 'learning_rate': '0.0001653', 'ppl': '2.243', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 12730368, 'tokens/trainable': 12599227, 'epoch': '2.041'}
 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1554/5680 [4:05:17<9:04:33,  7.92s/it] 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1555/5680 [4:05:25<9:04:51,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7404', 'grad_norm': '0.2853', 'learning_rate': '0.0001653', 'ppl': '2.097', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 12738560, 'tokens/trainable': 12607379, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1555/5680 [4:05:25<9:04:51,  7.93s/it] 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1556/5680 [4:05:32<9:04:29,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6026', 'grad_norm': '0.3114', 'learning_rate': '0.0001652', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12746752, 'tokens/trainable': 12615553, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▌                                                                                                                                           | 1556/5680 [4:05:32<9:04:29,  7.92s/it] 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1557/5680 [4:05:40<9:03:34,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4379', 'grad_norm': '0.2449', 'learning_rate': '0.0001652', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12754944, 'tokens/trainable': 12623694, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1557/5680 [4:05:40<9:03:34,  7.91s/it] 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1558/5680 [4:05:48<9:02:43,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6882', 'grad_norm': '0.2767', 'learning_rate': '0.0001652', 'ppl': '1.99', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12763136, 'tokens/trainable': 12631820, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1558/5680 [4:05:48<9:02:43,  7.90s/it] 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1559/5680 [4:05:56<9:03:14,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5947', 'grad_norm': '0.3367', 'learning_rate': '0.0001651', 'ppl': '1.812', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 12771328, 'tokens/trainable': 12639936, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1559/5680 [4:05:56<9:03:14,  7.91s/it] 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1560/5680 [4:06:04<9:03:06,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.823', 'grad_norm': '0.2934', 'learning_rate': '0.0001651', 'ppl': '2.277', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12779520, 'tokens/trainable': 12648097, 'epoch': '2.042'}
 27%|████████████████████████████████████████████████████▋                                                                                                                                           | 1560/5680 [4:06:04<9:03:06,  7.91s/it] 27%|████████████████████████████████████████████████████▊                                                                                                                                           | 1561/5680 [4:06:12<9:02:59,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6937', 'grad_norm': '0.2798', 'learning_rate': '0.000165', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 12787712, 'tokens/trainable': 12656177, 'epoch': '2.043'}
 27%|████████████████████████████████████████████████████▊                                                                                                                                           | 1561/5680 [4:06:12<9:02:59,  7.91s/it] 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1562/5680 [4:06:20<9:02:24,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6522', 'grad_norm': '0.285', 'learning_rate': '0.000165', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12795904, 'tokens/trainable': 12664334, 'epoch': '2.043'}
 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1562/5680 [4:06:20<9:02:24,  7.90s/it] 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1563/5680 [4:06:28<9:02:27,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5139', 'grad_norm': '0.2188', 'learning_rate': '0.0001649', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12804096, 'tokens/trainable': 12672523, 'epoch': '2.043'}
 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1563/5680 [4:06:28<9:02:27,  7.91s/it] 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1564/5680 [4:06:36<9:02:54,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7977', 'grad_norm': '0.2508', 'learning_rate': '0.0001649', 'ppl': '2.22', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 12812288, 'tokens/trainable': 12680662, 'epoch': '2.043'}
 28%|████████████████████████████████████████████████████▊                                                                                                                                           | 1564/5680 [4:06:36<9:02:54,  7.91s/it] 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1565/5680 [4:06:44<9:02:31,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5088', 'grad_norm': '0.227', 'learning_rate': '0.0001649', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12820480, 'tokens/trainable': 12688798, 'epoch': '2.043'}
 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1565/5680 [4:06:44<9:02:31,  7.91s/it] 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1566/5680 [4:06:51<9:01:06,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8524', 'grad_norm': '0.2644', 'learning_rate': '0.0001648', 'ppl': '2.345', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12828672, 'tokens/trainable': 12696910, 'epoch': '2.043'}
 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1566/5680 [4:06:51<9:01:06,  7.89s/it] 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1567/5680 [4:06:59<9:00:52,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.9815', 'grad_norm': '0.3125', 'learning_rate': '0.0001648', 'ppl': '2.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12836864, 'tokens/trainable': 12705094, 'epoch': '2.044'}
 28%|████████████████████████████████████████████████████▉                                                                                                                                           | 1567/5680 [4:06:59<9:00:52,  7.89s/it] 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1568/5680 [4:07:07<9:01:00,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8163', 'grad_norm': '0.3254', 'learning_rate': '0.0001647', 'ppl': '2.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12845056, 'tokens/trainable': 12713246, 'epoch': '2.044'}
 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1568/5680 [4:07:07<9:01:00,  7.89s/it] 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1569/5680 [4:07:15<9:01:05,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6879', 'grad_norm': '0.2527', 'learning_rate': '0.0001647', 'ppl': '1.989', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 12853248, 'tokens/trainable': 12721421, 'epoch': '2.044'}
 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1569/5680 [4:07:15<9:01:05,  7.90s/it] 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1570/5680 [4:07:23<9:01:27,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8189', 'grad_norm': '0.3041', 'learning_rate': '0.0001646', 'ppl': '2.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 12861440, 'tokens/trainable': 12729555, 'epoch': '2.044'}
 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1570/5680 [4:07:23<9:01:27,  7.90s/it] 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1571/5680 [4:07:31<9:01:27,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7652', 'grad_norm': '0.2872', 'learning_rate': '0.0001646', 'ppl': '2.149', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12869632, 'tokens/trainable': 12737715, 'epoch': '2.044'}
 28%|█████████████████████████████████████████████████████                                                                                                                                           | 1571/5680 [4:07:31<9:01:27,  7.91s/it] 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1572/5680 [4:07:39<9:01:52,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5907', 'grad_norm': '0.2859', 'learning_rate': '0.0001646', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 12877824, 'tokens/trainable': 12745899, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1572/5680 [4:07:39<9:01:52,  7.91s/it] 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1573/5680 [4:07:47<9:09:01,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4393', 'grad_norm': '0.2009', 'learning_rate': '0.0001645', 'ppl': '1.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.2', 'tokens/total': 12886016, 'tokens/trainable': 12754069, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1573/5680 [4:07:47<9:09:01,  8.02s/it] 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1574/5680 [4:07:55<9:05:58,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6494', 'grad_norm': '0.2719', 'learning_rate': '0.0001645', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12894208, 'tokens/trainable': 12762246, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1574/5680 [4:07:55<9:05:58,  7.98s/it] 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1575/5680 [4:08:03<9:03:25,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5414', 'grad_norm': '0.2626', 'learning_rate': '0.0001644', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 12902400, 'tokens/trainable': 12770401, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▏                                                                                                                                          | 1575/5680 [4:08:03<9:03:25,  7.94s/it] 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1576/5680 [4:08:11<9:01:25,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4609', 'grad_norm': '0.2286', 'learning_rate': '0.0001644', 'ppl': '1.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 12910592, 'tokens/trainable': 12778498, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1576/5680 [4:08:11<9:01:25,  7.92s/it] 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1577/5680 [4:08:19<9:00:59,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5557', 'grad_norm': '0.2492', 'learning_rate': '0.0001644', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12918784, 'tokens/trainable': 12786656, 'epoch': '2.045'}
 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1577/5680 [4:08:19<9:00:59,  7.91s/it] 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1578/5680 [4:08:27<9:00:10,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4408', 'grad_norm': '0.2451', 'learning_rate': '0.0001643', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 12926976, 'tokens/trainable': 12794792, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1578/5680 [4:08:27<9:00:10,  7.90s/it] 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1579/5680 [4:08:34<9:00:46,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6561', 'grad_norm': '0.2876', 'learning_rate': '0.0001643', 'ppl': '1.927', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 12935168, 'tokens/trainable': 12802964, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▎                                                                                                                                          | 1579/5680 [4:08:34<9:00:46,  7.91s/it] 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1580/5680 [4:08:42<9:01:19,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7628', 'grad_norm': '0.2553', 'learning_rate': '0.0001642', 'ppl': '2.144', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 12943360, 'tokens/trainable': 12811134, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1580/5680 [4:08:42<9:01:19,  7.92s/it] 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1581/5680 [4:08:50<9:01:03,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9536', 'grad_norm': '0.3107', 'learning_rate': '0.0001642', 'ppl': '2.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 12951552, 'tokens/trainable': 12819296, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1581/5680 [4:08:50<9:01:03,  7.92s/it] 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1582/5680 [4:08:58<9:01:34,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6087', 'grad_norm': '0.2548', 'learning_rate': '0.0001641', 'ppl': '1.838', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 12959744, 'tokens/trainable': 12827451, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▍                                                                                                                                          | 1582/5680 [4:08:58<9:01:34,  7.93s/it] 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1583/5680 [4:09:06<9:02:11,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6681', 'grad_norm': '0.2939', 'learning_rate': '0.0001641', 'ppl': '1.95', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 12967936, 'tokens/trainable': 12835592, 'epoch': '2.046'}
 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1583/5680 [4:09:06<9:02:11,  7.94s/it] 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1584/5680 [4:09:14<9:01:35,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6106', 'grad_norm': '0.2486', 'learning_rate': '0.0001641', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 12976128, 'tokens/trainable': 12843731, 'epoch': '2.047'}
 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1584/5680 [4:09:14<9:01:35,  7.93s/it] 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1585/5680 [4:09:22<9:02:33,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4211', 'grad_norm': '0.2284', 'learning_rate': '0.000164', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 12984320, 'tokens/trainable': 12851852, 'epoch': '2.047'}
 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1585/5680 [4:09:22<9:02:33,  7.95s/it] 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1586/5680 [4:09:30<9:00:30,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.529', 'grad_norm': '0.2556', 'learning_rate': '0.000164', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 12992512, 'tokens/trainable': 12859984, 'epoch': '2.047'}
 28%|█████████████████████████████████████████████████████▌                                                                                                                                          | 1586/5680 [4:09:30<9:00:30,  7.92s/it] 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1587/5680 [4:09:38<8:58:45,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7968', 'grad_norm': '0.2839', 'learning_rate': '0.0001639', 'ppl': '2.218', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 13000704, 'tokens/trainable': 12868164, 'epoch': '2.047'}
 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1587/5680 [4:09:38<8:58:45,  7.90s/it] 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1588/5680 [4:09:46<8:57:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7843', 'grad_norm': '0.293', 'learning_rate': '0.0001639', 'ppl': '2.191', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 13008896, 'tokens/trainable': 12876269, 'epoch': '2.047'}
 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1588/5680 [4:09:46<8:57:17,  7.88s/it] 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1589/5680 [4:09:54<8:56:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5411', 'grad_norm': '0.271', 'learning_rate': '0.0001638', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 13017088, 'tokens/trainable': 12884454, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1589/5680 [4:09:54<8:56:52,  7.87s/it] 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1590/5680 [4:10:01<8:56:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6801', 'grad_norm': '0.2846', 'learning_rate': '0.0001638', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13025280, 'tokens/trainable': 12892540, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▋                                                                                                                                          | 1590/5680 [4:10:01<8:56:13,  7.87s/it] 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1591/5680 [4:10:09<8:57:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6319', 'grad_norm': '0.2462', 'learning_rate': '0.0001638', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13033472, 'tokens/trainable': 12900672, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1591/5680 [4:10:09<8:57:01,  7.88s/it] 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1592/5680 [4:10:17<8:56:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6459', 'grad_norm': '0.2595', 'learning_rate': '0.0001637', 'ppl': '1.908', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 13041664, 'tokens/trainable': 12908843, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1592/5680 [4:10:17<8:56:34,  7.88s/it] 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1593/5680 [4:10:25<8:57:39,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7181', 'grad_norm': '0.288', 'learning_rate': '0.0001637', 'ppl': '2.051', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13049856, 'tokens/trainable': 12916997, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▊                                                                                                                                          | 1593/5680 [4:10:25<8:57:39,  7.89s/it] 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1594/5680 [4:10:33<8:57:26,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4534', 'grad_norm': '0.2342', 'learning_rate': '0.0001636', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13058048, 'tokens/trainable': 12925108, 'epoch': '2.048'}
 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1594/5680 [4:10:33<8:57:26,  7.89s/it] 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1595/5680 [4:10:41<8:56:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7017', 'grad_norm': '0.3145', 'learning_rate': '0.0001636', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 13066240, 'tokens/trainable': 12933230, 'epoch': '2.049'}
 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1595/5680 [4:10:41<8:56:25,  7.88s/it] 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1596/5680 [4:10:49<8:56:09,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6924', 'grad_norm': '0.2767', 'learning_rate': '0.0001635', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 13074432, 'tokens/trainable': 12941373, 'epoch': '2.049'}
 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1596/5680 [4:10:49<8:56:09,  7.88s/it] 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1597/5680 [4:10:57<8:55:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6409', 'grad_norm': '0.2757', 'learning_rate': '0.0001635', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13082624, 'tokens/trainable': 12949471, 'epoch': '2.049'}
 28%|█████████████████████████████████████████████████████▉                                                                                                                                          | 1597/5680 [4:10:57<8:55:32,  7.87s/it] 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1598/5680 [4:11:05<8:56:51,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5899', 'grad_norm': '0.2719', 'learning_rate': '0.0001635', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 13090816, 'tokens/trainable': 12957546, 'epoch': '2.049'}
 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1598/5680 [4:11:05<8:56:51,  7.89s/it] 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1599/5680 [4:11:12<8:55:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6306', 'grad_norm': '0.2386', 'learning_rate': '0.0001634', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 13099008, 'tokens/trainable': 12965666, 'epoch': '2.049'}
 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1599/5680 [4:11:12<8:55:05,  7.87s/it] 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1600/5680 [4:11:21<9:01:18,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7051', 'grad_norm': '0.258', 'learning_rate': '0.0001634', 'ppl': '2.024', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.6', 'tokens/total': 13107200, 'tokens/trainable': 12973807, 'epoch': '2.049'}
 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1600/5680 [4:11:21<9:01:18,  7.96s/it] 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1601/5680 [4:11:28<8:59:39,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7061', 'grad_norm': '0.2695', 'learning_rate': '0.0001633', 'ppl': '2.026', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 13115392, 'tokens/trainable': 12981963, 'epoch': '2.05'}
 28%|██████████████████████████████████████████████████████                                                                                                                                          | 1601/5680 [4:11:28<8:59:39,  7.94s/it] 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1602/5680 [4:11:36<8:58:20,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3526', 'grad_norm': '0.2089', 'learning_rate': '0.0001633', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13123584, 'tokens/trainable': 12990050, 'epoch': '2.05'}
 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1602/5680 [4:11:36<8:58:20,  7.92s/it] 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1603/5680 [4:11:44<8:58:42,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.617', 'grad_norm': '0.2585', 'learning_rate': '0.0001632', 'ppl': '1.853', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13131776, 'tokens/trainable': 12998205, 'epoch': '2.05'}
 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1603/5680 [4:11:44<8:58:42,  7.93s/it] 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1604/5680 [4:11:52<8:58:47,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6515', 'grad_norm': '0.2616', 'learning_rate': '0.0001632', 'ppl': '1.918', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13139968, 'tokens/trainable': 13006364, 'epoch': '2.05'}
 28%|██████████████████████████████████████████████████████▏                                                                                                                                         | 1604/5680 [4:11:52<8:58:47,  7.93s/it] 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1605/5680 [4:12:00<8:59:14,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6073', 'grad_norm': '0.2211', 'learning_rate': '0.0001632', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13148160, 'tokens/trainable': 13014520, 'epoch': '2.05'}
 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1605/5680 [4:12:00<8:59:14,  7.94s/it] 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1606/5680 [4:12:08<8:58:59,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6459', 'grad_norm': '0.2655', 'learning_rate': '0.0001631', 'ppl': '1.908', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 13156352, 'tokens/trainable': 13022691, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1606/5680 [4:12:08<8:58:59,  7.94s/it] 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1607/5680 [4:12:16<8:59:18,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4519', 'grad_norm': '0.2283', 'learning_rate': '0.0001631', 'ppl': '1.571', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13164544, 'tokens/trainable': 13030850, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1607/5680 [4:12:16<8:59:18,  7.94s/it] 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1608/5680 [4:12:24<8:59:40,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7468', 'grad_norm': '0.2623', 'learning_rate': '0.000163', 'ppl': '2.11', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 13172736, 'tokens/trainable': 13038991, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▎                                                                                                                                         | 1608/5680 [4:12:24<8:59:40,  7.95s/it] 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1609/5680 [4:12:32<9:00:05,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8786', 'grad_norm': '0.3474', 'learning_rate': '0.000163', 'ppl': '2.408', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 13180928, 'tokens/trainable': 13047128, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1609/5680 [4:12:32<9:00:05,  7.96s/it] 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1610/5680 [4:12:40<8:59:55,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6576', 'grad_norm': '0.2915', 'learning_rate': '0.0001629', 'ppl': '1.93', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 13189120, 'tokens/trainable': 13055238, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1610/5680 [4:12:40<8:59:55,  7.96s/it] 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1611/5680 [4:12:48<8:58:45,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8564', 'grad_norm': '0.3065', 'learning_rate': '0.0001629', 'ppl': '2.355', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 13197312, 'tokens/trainable': 13063351, 'epoch': '2.051'}
 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1611/5680 [4:12:48<8:58:45,  7.94s/it] 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1612/5680 [4:12:56<8:57:05,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4403', 'grad_norm': '0.2397', 'learning_rate': '0.0001629', 'ppl': '1.553', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13205504, 'tokens/trainable': 13071435, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▍                                                                                                                                         | 1612/5680 [4:12:56<8:57:05,  7.92s/it] 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1613/5680 [4:13:04<8:56:08,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5831', 'grad_norm': '0.2736', 'learning_rate': '0.0001628', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 13213696, 'tokens/trainable': 13079551, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1613/5680 [4:13:04<8:56:08,  7.91s/it] 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1614/5680 [4:13:11<8:55:32,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7712', 'grad_norm': '0.3136', 'learning_rate': '0.0001628', 'ppl': '2.162', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 13221888, 'tokens/trainable': 13087694, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1614/5680 [4:13:11<8:55:32,  7.90s/it] 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1615/5680 [4:13:19<8:56:03,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5982', 'grad_norm': '0.2754', 'learning_rate': '0.0001627', 'ppl': '1.819', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13230080, 'tokens/trainable': 13095872, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▌                                                                                                                                         | 1615/5680 [4:13:19<8:56:03,  7.91s/it] 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1616/5680 [4:13:28<9:01:42,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.467', 'grad_norm': '0.2403', 'learning_rate': '0.0001627', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991', 'tokens/total': 13238272, 'tokens/trainable': 13103993, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1616/5680 [4:13:28<9:01:42,  8.00s/it] 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1617/5680 [4:13:36<8:59:34,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4828', 'grad_norm': '0.2475', 'learning_rate': '0.0001626', 'ppl': '1.621', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13246464, 'tokens/trainable': 13112105, 'epoch': '2.052'}
 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1617/5680 [4:13:36<8:59:34,  7.97s/it] 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1618/5680 [4:13:43<8:58:13,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4938', 'grad_norm': '0.2337', 'learning_rate': '0.0001626', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 13254656, 'tokens/trainable': 13120271, 'epoch': '2.053'}
 28%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1618/5680 [4:13:43<8:58:13,  7.95s/it] 29%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1619/5680 [4:13:51<8:58:09,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6635', 'grad_norm': '0.2629', 'learning_rate': '0.0001626', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 13262848, 'tokens/trainable': 13128402, 'epoch': '2.053'}
 29%|██████████████████████████████████████████████████████▋                                                                                                                                         | 1619/5680 [4:13:51<8:58:09,  7.95s/it] 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1620/5680 [4:13:59<8:58:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7056', 'grad_norm': '0.3234', 'learning_rate': '0.0001625', 'ppl': '2.025', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 13271040, 'tokens/trainable': 13136527, 'epoch': '2.053'}
 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1620/5680 [4:13:59<8:58:17,  7.96s/it] 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1621/5680 [4:14:07<8:57:29,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.9496', 'grad_norm': '0.3109', 'learning_rate': '0.0001625', 'ppl': '2.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13279232, 'tokens/trainable': 13144688, 'epoch': '2.053'}
 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1621/5680 [4:14:07<8:57:29,  7.95s/it] 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1622/5680 [4:14:15<8:56:44,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5965', 'grad_norm': '0.264', 'learning_rate': '0.0001624', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 13287424, 'tokens/trainable': 13152805, 'epoch': '2.053'}
 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1622/5680 [4:14:15<8:56:44,  7.94s/it] 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1623/5680 [4:14:23<8:55:29,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5181', 'grad_norm': '0.2381', 'learning_rate': '0.0001624', 'ppl': '1.679', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 13295616, 'tokens/trainable': 13160975, 'epoch': '2.054'}
 29%|██████████████████████████████████████████████████████▊                                                                                                                                         | 1623/5680 [4:14:23<8:55:29,  7.92s/it] 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1624/5680 [4:14:31<8:54:16,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6987', 'grad_norm': '0.2706', 'learning_rate': '0.0001623', 'ppl': '2.011', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 13303808, 'tokens/trainable': 13169152, 'epoch': '2.054'}
 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1624/5680 [4:14:31<8:54:16,  7.90s/it] 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1625/5680 [4:14:39<8:53:45,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5141', 'grad_norm': '0.2568', 'learning_rate': '0.0001623', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13312000, 'tokens/trainable': 13177284, 'epoch': '2.054'}
 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1625/5680 [4:14:39<8:53:45,  7.90s/it] 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1626/5680 [4:14:47<8:52:43,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8928', 'grad_norm': '0.2794', 'learning_rate': '0.0001623', 'ppl': '2.442', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 13320192, 'tokens/trainable': 13185421, 'epoch': '2.054'}
 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1626/5680 [4:14:47<8:52:43,  7.88s/it] 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1627/5680 [4:14:55<8:53:40,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4439', 'grad_norm': '0.2702', 'learning_rate': '0.0001622', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13328384, 'tokens/trainable': 13193609, 'epoch': '2.054'}
 29%|██████████████████████████████████████████████████████▉                                                                                                                                         | 1627/5680 [4:14:55<8:53:40,  7.90s/it] 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1628/5680 [4:15:03<8:54:54,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9545', 'grad_norm': '0.2937', 'learning_rate': '0.0001622', 'ppl': '2.597', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 13336576, 'tokens/trainable': 13201767, 'epoch': '2.054'}
 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1628/5680 [4:15:03<8:54:54,  7.92s/it] 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1629/5680 [4:15:11<8:55:40,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6891', 'grad_norm': '0.2728', 'learning_rate': '0.0001621', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 13344768, 'tokens/trainable': 13209889, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1629/5680 [4:15:11<8:55:40,  7.93s/it] 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1630/5680 [4:15:18<8:54:48,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5626', 'grad_norm': '0.2518', 'learning_rate': '0.0001621', 'ppl': '1.755', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 13352960, 'tokens/trainable': 13218016, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████                                                                                                                                         | 1630/5680 [4:15:18<8:54:48,  7.92s/it] 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1631/5680 [4:15:26<8:53:41,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6736', 'grad_norm': '0.2961', 'learning_rate': '0.000162', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 13361152, 'tokens/trainable': 13226127, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1631/5680 [4:15:26<8:53:41,  7.91s/it] 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1632/5680 [4:15:34<8:54:20,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6287', 'grad_norm': '0.2647', 'learning_rate': '0.000162', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13369344, 'tokens/trainable': 13234291, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1632/5680 [4:15:34<8:54:20,  7.92s/it] 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1633/5680 [4:15:42<8:53:03,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5325', 'grad_norm': '0.2828', 'learning_rate': '0.000162', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 13377536, 'tokens/trainable': 13242449, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1633/5680 [4:15:42<8:53:03,  7.90s/it] 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1634/5680 [4:15:50<8:51:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '1.003', 'grad_norm': '0.3018', 'learning_rate': '0.0001619', 'ppl': '2.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 13385728, 'tokens/trainable': 13250570, 'epoch': '2.055'}
 29%|███████████████████████████████████████████████████████▏                                                                                                                                        | 1634/5680 [4:15:50<8:51:44,  7.89s/it] 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1635/5680 [4:15:58<8:50:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7022', 'grad_norm': '0.3166', 'learning_rate': '0.0001619', 'ppl': '2.018', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 13393920, 'tokens/trainable': 13258733, 'epoch': '2.056'}
 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1635/5680 [4:15:58<8:50:45,  7.87s/it] 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1636/5680 [4:16:06<8:50:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7295', 'grad_norm': '0.2985', 'learning_rate': '0.0001618', 'ppl': '2.074', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13402112, 'tokens/trainable': 13266862, 'epoch': '2.056'}
 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1636/5680 [4:16:06<8:50:41,  7.87s/it] 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1637/5680 [4:16:14<8:50:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5976', 'grad_norm': '0.2796', 'learning_rate': '0.0001618', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 13410304, 'tokens/trainable': 13274991, 'epoch': '2.056'}
 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1637/5680 [4:16:14<8:50:29,  7.87s/it] 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1638/5680 [4:16:21<8:50:55,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.497', 'grad_norm': '0.2413', 'learning_rate': '0.0001617', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 13418496, 'tokens/trainable': 13283162, 'epoch': '2.056'}
 29%|███████████████████████████████████████████████████████▎                                                                                                                                        | 1638/5680 [4:16:21<8:50:55,  7.88s/it] 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1639/5680 [4:16:29<8:51:49,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.435', 'grad_norm': '0.2273', 'learning_rate': '0.0001617', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13426688, 'tokens/trainable': 13291334, 'epoch': '2.056'}
 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1639/5680 [4:16:29<8:51:49,  7.90s/it] 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1640/5680 [4:16:37<8:50:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9518', 'grad_norm': '0.2701', 'learning_rate': '0.0001616', 'ppl': '2.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 13434880, 'tokens/trainable': 13299468, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1640/5680 [4:16:37<8:50:47,  7.88s/it] 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1641/5680 [4:16:45<8:50:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5392', 'grad_norm': '0.3109', 'learning_rate': '0.0001616', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 13443072, 'tokens/trainable': 13307639, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▍                                                                                                                                        | 1641/5680 [4:16:45<8:50:11,  7.88s/it] 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1642/5680 [4:16:53<8:49:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5466', 'grad_norm': '0.2652', 'learning_rate': '0.0001616', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 13451264, 'tokens/trainable': 13315811, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1642/5680 [4:16:53<8:49:54,  7.87s/it] 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1643/5680 [4:17:01<8:52:12,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5194', 'grad_norm': '0.2933', 'learning_rate': '0.0001615', 'ppl': '1.681', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 13459456, 'tokens/trainable': 13323967, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1643/5680 [4:17:01<8:52:12,  7.91s/it] 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1644/5680 [4:17:09<8:51:33,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8858', 'grad_norm': '0.2771', 'learning_rate': '0.0001615', 'ppl': '2.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 13467648, 'tokens/trainable': 13332149, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1644/5680 [4:17:09<8:51:33,  7.90s/it] 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1645/5680 [4:17:17<8:51:42,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5505', 'grad_norm': '0.2985', 'learning_rate': '0.0001614', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 13475840, 'tokens/trainable': 13340335, 'epoch': '2.057'}
 29%|███████████████████████████████████████████████████████▌                                                                                                                                        | 1645/5680 [4:17:17<8:51:42,  7.91s/it] 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1646/5680 [4:17:25<8:51:17,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6725', 'grad_norm': '0.2773', 'learning_rate': '0.0001614', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 13484032, 'tokens/trainable': 13348496, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1646/5680 [4:17:25<8:51:17,  7.90s/it] 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1647/5680 [4:17:33<8:51:27,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5392', 'grad_norm': '0.2571', 'learning_rate': '0.0001613', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 13492224, 'tokens/trainable': 13356576, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1647/5680 [4:17:33<8:51:27,  7.91s/it] 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1648/5680 [4:17:40<8:50:17,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4769', 'grad_norm': '0.2466', 'learning_rate': '0.0001613', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 13500416, 'tokens/trainable': 13364755, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1648/5680 [4:17:40<8:50:17,  7.89s/it] 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1649/5680 [4:17:48<8:51:11,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5588', 'grad_norm': '0.2957', 'learning_rate': '0.0001613', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 13508608, 'tokens/trainable': 13372874, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▋                                                                                                                                        | 1649/5680 [4:17:48<8:51:11,  7.91s/it] 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1650/5680 [4:17:56<8:51:48,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.497', 'grad_norm': '0.6442', 'learning_rate': '0.0001612', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 13516800, 'tokens/trainable': 13381012, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1650/5680 [4:17:56<8:51:48,  7.92s/it] 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1651/5680 [4:18:04<8:53:05,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6171', 'grad_norm': '0.2861', 'learning_rate': '0.0001612', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 13524992, 'tokens/trainable': 13389135, 'epoch': '2.058'}
 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1651/5680 [4:18:04<8:53:05,  7.94s/it] 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1652/5680 [4:18:12<8:52:09,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.575', 'grad_norm': '0.2873', 'learning_rate': '0.0001611', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 13533184, 'tokens/trainable': 13397313, 'epoch': '2.059'}
 29%|███████████████████████████████████████████████████████▊                                                                                                                                        | 1652/5680 [4:18:12<8:52:09,  7.93s/it] 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1653/5680 [4:18:20<8:51:57,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5637', 'grad_norm': '0.288', 'learning_rate': '0.0001611', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 13541376, 'tokens/trainable': 13405438, 'epoch': '2.059'}
 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1653/5680 [4:18:20<8:51:57,  7.93s/it] 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1654/5680 [4:18:28<8:51:19,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5671', 'grad_norm': '0.2647', 'learning_rate': '0.000161', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 13549568, 'tokens/trainable': 13413617, 'epoch': '2.059'}
 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1654/5680 [4:18:28<8:51:19,  7.92s/it] 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1655/5680 [4:18:36<8:50:51,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5483', 'grad_norm': '0.2295', 'learning_rate': '0.000161', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 13557760, 'tokens/trainable': 13421758, 'epoch': '2.059'}
 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1655/5680 [4:18:36<8:50:51,  7.91s/it] 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1656/5680 [4:18:44<8:50:12,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6327', 'grad_norm': '0.2531', 'learning_rate': '0.0001609', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 13565952, 'tokens/trainable': 13429907, 'epoch': '2.059'}
 29%|███████████████████████████████████████████████████████▉                                                                                                                                        | 1656/5680 [4:18:44<8:50:12,  7.91s/it] 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1657/5680 [4:18:52<8:50:24,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5276', 'grad_norm': '0.2404', 'learning_rate': '0.0001609', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 13574144, 'tokens/trainable': 13437989, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1657/5680 [4:18:52<8:50:24,  7.91s/it] 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1658/5680 [4:19:00<8:50:04,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4575', 'grad_norm': '0.2268', 'learning_rate': '0.0001609', 'ppl': '1.58', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13582336, 'tokens/trainable': 13446137, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1658/5680 [4:19:00<8:50:04,  7.91s/it] 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1659/5680 [4:19:08<8:57:23,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4084', 'grad_norm': '0.2338', 'learning_rate': '0.0001608', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.8', 'tokens/total': 13590528, 'tokens/trainable': 13454283, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1659/5680 [4:19:08<8:57:23,  8.02s/it] 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1660/5680 [4:19:16<8:56:07,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5976', 'grad_norm': '0.2743', 'learning_rate': '0.0001608', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 13598720, 'tokens/trainable': 13462415, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████                                                                                                                                        | 1660/5680 [4:19:16<8:56:07,  8.00s/it] 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1661/5680 [4:19:24<8:54:45,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6023', 'grad_norm': '0.294', 'learning_rate': '0.0001607', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13606912, 'tokens/trainable': 13470550, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1661/5680 [4:19:24<8:54:45,  7.98s/it] 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1662/5680 [4:19:32<8:53:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7376', 'grad_norm': '0.2654', 'learning_rate': '0.0001607', 'ppl': '2.091', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 13615104, 'tokens/trainable': 13478724, 'epoch': '2.06'}
 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1662/5680 [4:19:32<8:53:30,  7.97s/it] 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1663/5680 [4:19:40<8:53:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6381', 'grad_norm': '0.2625', 'learning_rate': '0.0001606', 'ppl': '1.893', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 13623296, 'tokens/trainable': 13486801, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1663/5680 [4:19:40<8:53:15,  7.97s/it] 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1664/5680 [4:19:48<8:53:10,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6913', 'grad_norm': '0.2823', 'learning_rate': '0.0001606', 'ppl': '1.996', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13631488, 'tokens/trainable': 13494979, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▏                                                                                                                                       | 1664/5680 [4:19:48<8:53:10,  7.97s/it] 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1665/5680 [4:19:56<8:52:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6105', 'grad_norm': '0.2715', 'learning_rate': '0.0001606', 'ppl': '1.841', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 13639680, 'tokens/trainable': 13503133, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1665/5680 [4:19:56<8:52:19,  7.96s/it] 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1666/5680 [4:20:03<8:50:44,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6734', 'grad_norm': '0.3038', 'learning_rate': '0.0001605', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 13647872, 'tokens/trainable': 13511318, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1666/5680 [4:20:03<8:50:44,  7.93s/it] 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1667/5680 [4:20:11<8:49:13,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7677', 'grad_norm': '0.2767', 'learning_rate': '0.0001605', 'ppl': '2.155', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 13656064, 'tokens/trainable': 13519491, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▎                                                                                                                                       | 1667/5680 [4:20:11<8:49:13,  7.91s/it] 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1668/5680 [4:20:19<8:48:03,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8535', 'grad_norm': '0.283', 'learning_rate': '0.0001604', 'ppl': '2.348', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 13664256, 'tokens/trainable': 13527662, 'epoch': '2.061'}
 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1668/5680 [4:20:19<8:48:03,  7.90s/it] 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1669/5680 [4:20:27<8:47:40,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.832', 'grad_norm': '0.2688', 'learning_rate': '0.0001604', 'ppl': '2.298', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13672448, 'tokens/trainable': 13535800, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1669/5680 [4:20:27<8:47:40,  7.89s/it] 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1670/5680 [4:20:35<8:46:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4819', 'grad_norm': '0.2603', 'learning_rate': '0.0001603', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 13680640, 'tokens/trainable': 13543938, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1670/5680 [4:20:35<8:46:30,  7.88s/it] 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1671/5680 [4:20:43<8:46:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4607', 'grad_norm': '0.2375', 'learning_rate': '0.0001603', 'ppl': '1.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13688832, 'tokens/trainable': 13552029, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▍                                                                                                                                       | 1671/5680 [4:20:43<8:46:45,  7.88s/it] 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1672/5680 [4:20:51<8:46:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8141', 'grad_norm': '0.2763', 'learning_rate': '0.0001602', 'ppl': '2.257', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 13697024, 'tokens/trainable': 13560157, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1672/5680 [4:20:51<8:46:12,  7.88s/it] 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1673/5680 [4:20:59<8:47:36,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5465', 'grad_norm': '0.3323', 'learning_rate': '0.0001602', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13705216, 'tokens/trainable': 13568326, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1673/5680 [4:20:59<8:47:36,  7.90s/it] 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1674/5680 [4:21:07<8:48:07,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.563', 'grad_norm': '0.26', 'learning_rate': '0.0001602', 'ppl': '1.756', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13713408, 'tokens/trainable': 13576507, 'epoch': '2.062'}
 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1674/5680 [4:21:07<8:48:07,  7.91s/it] 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1675/5680 [4:21:15<8:49:00,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5398', 'grad_norm': '0.2686', 'learning_rate': '0.0001601', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13721600, 'tokens/trainable': 13584666, 'epoch': '2.063'}
 29%|████████████████████████████████████████████████████████▌                                                                                                                                       | 1675/5680 [4:21:15<8:49:00,  7.93s/it] 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1676/5680 [4:21:22<8:49:13,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7029', 'grad_norm': '0.2708', 'learning_rate': '0.0001601', 'ppl': '2.02', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 13729792, 'tokens/trainable': 13592756, 'epoch': '2.063'}
 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1676/5680 [4:21:22<8:49:13,  7.93s/it] 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1677/5680 [4:21:30<8:49:12,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6008', 'grad_norm': '0.2655', 'learning_rate': '0.00016', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 13737984, 'tokens/trainable': 13600943, 'epoch': '2.063'}
 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1677/5680 [4:21:30<8:49:12,  7.93s/it] 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1678/5680 [4:21:38<8:50:05,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.681', 'grad_norm': '0.3029', 'learning_rate': '0.00016', 'ppl': '1.976', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 13746176, 'tokens/trainable': 13609061, 'epoch': '2.063'}
 30%|████████████████████████████████████████████████████████▋                                                                                                                                       | 1678/5680 [4:21:38<8:50:05,  7.95s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1679/5680 [4:21:46<8:49:08,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8109', 'grad_norm': '0.2512', 'learning_rate': '0.0001599', 'ppl': '2.25', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 13754368, 'tokens/trainable': 13617243, 'epoch': '2.063'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1679/5680 [4:21:46<8:49:08,  7.94s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1680/5680 [4:21:54<8:49:06,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7067', 'grad_norm': '0.2651', 'learning_rate': '0.0001599', 'ppl': '2.027', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 13762560, 'tokens/trainable': 13625396, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1680/5680 [4:21:54<8:49:06,  7.94s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1681/5680 [4:22:02<8:49:47,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5632', 'grad_norm': '0.2432', 'learning_rate': '0.0001598', 'ppl': '1.756', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 13770752, 'tokens/trainable': 13633562, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1681/5680 [4:22:02<8:49:47,  7.95s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1682/5680 [4:22:13<9:56:13,  8.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7873', 'grad_norm': '0.3248', 'learning_rate': '0.0001598', 'ppl': '2.197', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '720.3', 'tokens/total': 13778944, 'tokens/trainable': 13641684, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                       | 1682/5680 [4:22:13<9:56:13,  8.95s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1683/5680 [4:22:21<9:36:35,  8.66s/it]                                                                                                                                                                                                                                             {'loss': '0.698', 'grad_norm': '0.2834', 'learning_rate': '0.0001598', 'ppl': '2.01', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 13787136, 'tokens/trainable': 13649805, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1683/5680 [4:22:21<9:36:35,  8.66s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1684/5680 [4:22:29<9:23:01,  8.45s/it]                                                                                                                                                                                                                                             {'loss': '0.4421', 'grad_norm': '0.2478', 'learning_rate': '0.0001597', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 13795328, 'tokens/trainable': 13657878, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1684/5680 [4:22:29<9:23:01,  8.45s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1685/5680 [4:22:37<9:12:47,  8.30s/it]                                                                                                                                                                                                                                             {'loss': '1.036', 'grad_norm': '0.3103', 'learning_rate': '0.0001597', 'ppl': '2.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 13803520, 'tokens/trainable': 13666020, 'epoch': '2.064'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1685/5680 [4:22:37<9:12:47,  8.30s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1686/5680 [4:22:47<9:40:51,  8.73s/it]                                                                                                                                                                                                                                             {'loss': '0.7495', 'grad_norm': '0.3358', 'learning_rate': '0.0001596', 'ppl': '2.116', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '841.7', 'tokens/total': 13811712, 'tokens/trainable': 13674195, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                       | 1686/5680 [4:22:47<9:40:51,  8.73s/it] 30%|████████████████████████████████████████████████████████▋                                                                                                                                      | 1687/5680 [4:23:00<11:00:47,  9.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7264', 'grad_norm': '0.2714', 'learning_rate': '0.0001596', 'ppl': '2.068', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '641.6', 'tokens/total': 13819904, 'tokens/trainable': 13682362, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▋                                                                                                                                      | 1687/5680 [4:23:00<11:00:47,  9.93s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1688/5680 [4:23:11<11:30:54, 10.38s/it]                                                                                                                                                                                                                                             {'loss': '0.7473', 'grad_norm': '0.2637', 'learning_rate': '0.0001595', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '711.5', 'tokens/total': 13828096, 'tokens/trainable': 13690500, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1688/5680 [4:23:11<11:30:54, 10.38s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1689/5680 [4:23:23<11:48:13, 10.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5493', 'grad_norm': '0.2588', 'learning_rate': '0.0001595', 'ppl': '1.732', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '719.1', 'tokens/total': 13836288, 'tokens/trainable': 13698589, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1689/5680 [4:23:23<11:48:13, 10.65s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1690/5680 [4:23:35<12:25:11, 11.21s/it]                                                                                                                                                                                                                                             {'loss': '0.7369', 'grad_norm': '0.3015', 'learning_rate': '0.0001594', 'ppl': '2.089', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.3', 'tokens/total': 13844480, 'tokens/trainable': 13706732, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1690/5680 [4:23:35<12:25:11, 11.21s/it] 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1691/5680 [4:23:46<12:21:55, 11.16s/it]                                                                                                                                                                                                                                             {'loss': '0.7508', 'grad_norm': '0.3208', 'learning_rate': '0.0001594', 'ppl': '2.119', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '740.3', 'tokens/total': 13852672, 'tokens/trainable': 13714908, 'epoch': '2.065'}
 30%|████████████████████████████████████████████████████████▊                                                                                                                                      | 1691/5680 [4:23:46<12:21:55, 11.16s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1692/5680 [4:23:58<12:29:47, 11.28s/it]                                                                                                                                                                                                                                             {'loss': '0.691', 'grad_norm': '0.2605', 'learning_rate': '0.0001594', 'ppl': '1.996', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '704.1', 'tokens/total': 13860864, 'tokens/trainable': 13723048, 'epoch': '2.066'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1692/5680 [4:23:58<12:29:47, 11.28s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1693/5680 [4:24:10<12:55:56, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.5195', 'grad_norm': '0.2501', 'learning_rate': '0.0001593', 'ppl': '1.681', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.5', 'tokens/total': 13869056, 'tokens/trainable': 13731164, 'epoch': '2.066'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1693/5680 [4:24:10<12:55:56, 11.68s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1694/5680 [4:24:21<12:36:10, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.8014', 'grad_norm': '0.3289', 'learning_rate': '0.0001593', 'ppl': '2.229', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '764', 'tokens/total': 13877248, 'tokens/trainable': 13739332, 'epoch': '2.066'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1694/5680 [4:24:21<12:36:10, 11.38s/it] 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1695/5680 [4:24:33<12:44:50, 11.52s/it]                                                                                                                                                                                                                                             {'loss': '0.7446', 'grad_norm': '0.2792', 'learning_rate': '0.0001592', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '689.7', 'tokens/total': 13885440, 'tokens/trainable': 13747480, 'epoch': '2.066'}
 30%|████████████████████████████████████████████████████████▉                                                                                                                                      | 1695/5680 [4:24:33<12:44:50, 11.52s/it] 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1696/5680 [4:24:45<13:05:25, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.7013', 'grad_norm': '0.2951', 'learning_rate': '0.0001592', 'ppl': '2.016', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.2', 'tokens/total': 13893632, 'tokens/trainable': 13755645, 'epoch': '2.066'}
 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1696/5680 [4:24:45<13:05:25, 11.83s/it] 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1697/5680 [4:24:56<12:35:37, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.6886', 'grad_norm': '0.2463', 'learning_rate': '0.0001591', 'ppl': '1.991', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '791.2', 'tokens/total': 13901824, 'tokens/trainable': 13763822, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1697/5680 [4:24:56<12:35:37, 11.38s/it] 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1698/5680 [4:25:08<12:51:00, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.6856', 'grad_norm': '0.3039', 'learning_rate': '0.0001591', 'ppl': '1.985', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '669', 'tokens/total': 13910016, 'tokens/trainable': 13771959, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████                                                                                                                                      | 1698/5680 [4:25:08<12:51:00, 11.62s/it] 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1699/5680 [4:25:20<13:05:42, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.8353', 'grad_norm': '0.4299', 'learning_rate': '0.000159', 'ppl': '2.305', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.1', 'tokens/total': 13918208, 'tokens/trainable': 13780069, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1699/5680 [4:25:20<13:05:42, 11.84s/it] 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1700/5680 [4:25:31<12:34:53, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.7477', 'grad_norm': '0.2766', 'learning_rate': '0.000159', 'ppl': '2.112', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '793.9', 'tokens/total': 13926400, 'tokens/trainable': 13788240, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1700/5680 [4:25:31<12:34:53, 11.38s/it] 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1701/5680 [4:25:43<12:56:20, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.5405', 'grad_norm': '0.2314', 'learning_rate': '0.000159', 'ppl': '1.717', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.4', 'tokens/total': 13934592, 'tokens/trainable': 13796369, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1701/5680 [4:25:43<12:56:20, 11.71s/it] 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1702/5680 [4:25:55<13:09:52, 11.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5707', 'grad_norm': '0.2392', 'learning_rate': '0.0001589', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '657.1', 'tokens/total': 13942784, 'tokens/trainable': 13804512, 'epoch': '2.067'}
 30%|█████████████████████████████████████████████████████████▏                                                                                                                                     | 1702/5680 [4:25:55<13:09:52, 11.91s/it] 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1703/5680 [4:26:06<12:41:40, 11.49s/it]                                                                                                                                                                                                                                             {'loss': '0.6006', 'grad_norm': '0.2387', 'learning_rate': '0.0001589', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '776.8', 'tokens/total': 13950976, 'tokens/trainable': 13812661, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1703/5680 [4:26:06<12:41:40, 11.49s/it] 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1704/5680 [4:26:18<13:01:47, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.5665', 'grad_norm': '0.2701', 'learning_rate': '0.0001588', 'ppl': '1.762', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.7', 'tokens/total': 13959168, 'tokens/trainable': 13820762, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1704/5680 [4:26:18<13:01:47, 11.80s/it] 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1705/5680 [4:26:30<13:02:01, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.8479', 'grad_norm': '0.314', 'learning_rate': '0.0001588', 'ppl': '2.335', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '692', 'tokens/total': 13967360, 'tokens/trainable': 13828935, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1705/5680 [4:26:30<13:02:01, 11.80s/it] 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1706/5680 [4:26:41<12:42:28, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.604', 'grad_norm': '0.2545', 'learning_rate': '0.0001587', 'ppl': '1.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '753.9', 'tokens/total': 13975552, 'tokens/trainable': 13837073, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▎                                                                                                                                     | 1706/5680 [4:26:41<12:42:28, 11.51s/it] 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1707/5680 [4:26:54<13:01:26, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.577', 'grad_norm': '0.2515', 'learning_rate': '0.0001587', 'ppl': '1.781', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.4', 'tokens/total': 13983744, 'tokens/trainable': 13845208, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1707/5680 [4:26:54<13:01:26, 11.80s/it] 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1708/5680 [4:27:05<12:51:18, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5016', 'grad_norm': '0.2312', 'learning_rate': '0.0001586', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.8', 'tokens/total': 13991936, 'tokens/trainable': 13853307, 'epoch': '2.068'}
 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1708/5680 [4:27:05<12:51:18, 11.65s/it] 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1709/5680 [4:27:16<12:42:21, 11.52s/it]                                                                                                                                                                                                                                             {'loss': '0.6026', 'grad_norm': '0.3466', 'learning_rate': '0.0001586', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '725.8', 'tokens/total': 14000128, 'tokens/trainable': 13861433, 'epoch': '2.069'}
 30%|█████████████████████████████████████████████████████████▍                                                                                                                                     | 1709/5680 [4:27:16<12:42:21, 11.52s/it] 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1710/5680 [4:27:29<13:01:53, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.6596', 'grad_norm': '0.2327', 'learning_rate': '0.0001586', 'ppl': '1.934', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.8', 'tokens/total': 14008320, 'tokens/trainable': 13869597, 'epoch': '2.069'}
 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1710/5680 [4:27:29<13:01:53, 11.82s/it] 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1711/5680 [4:27:39<12:42:17, 11.52s/it]                                                                                                                                                                                                                                             {'loss': '0.5203', 'grad_norm': '0.2513', 'learning_rate': '0.0001585', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '753.7', 'tokens/total': 14016512, 'tokens/trainable': 13877760, 'epoch': '2.069'}
 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1711/5680 [4:27:39<12:42:17, 11.52s/it] 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1712/5680 [4:27:51<12:43:16, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.6627', 'grad_norm': '0.2477', 'learning_rate': '0.0001585', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.5', 'tokens/total': 14024704, 'tokens/trainable': 13885878, 'epoch': '2.069'}
 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1712/5680 [4:27:51<12:43:16, 11.54s/it] 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1713/5680 [4:28:04<13:03:44, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5597', 'grad_norm': '0.2425', 'learning_rate': '0.0001584', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.4', 'tokens/total': 14032896, 'tokens/trainable': 13894007, 'epoch': '2.069'}
 30%|█████████████████████████████████████████████████████████▌                                                                                                                                     | 1713/5680 [4:28:04<13:03:44, 11.85s/it] 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1714/5680 [4:28:14<12:38:55, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.3702', 'grad_norm': '0.3237', 'learning_rate': '0.0001584', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '770.2', 'tokens/total': 14041088, 'tokens/trainable': 13902177, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1714/5680 [4:28:14<12:38:55, 11.48s/it] 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1715/5680 [4:28:26<12:48:44, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.4651', 'grad_norm': '0.26', 'learning_rate': '0.0001583', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '683.5', 'tokens/total': 14049280, 'tokens/trainable': 13910358, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1715/5680 [4:28:26<12:48:44, 11.63s/it] 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1716/5680 [4:28:39<13:07:57, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7687', 'grad_norm': '0.2675', 'learning_rate': '0.0001583', 'ppl': '2.157', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.5', 'tokens/total': 14057472, 'tokens/trainable': 13918538, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1716/5680 [4:28:39<13:07:57, 11.93s/it] 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1717/5680 [4:28:49<12:33:46, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.7453', 'grad_norm': '0.3259', 'learning_rate': '0.0001582', 'ppl': '2.107', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '794.9', 'tokens/total': 14065664, 'tokens/trainable': 13926649, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▋                                                                                                                                     | 1717/5680 [4:28:49<12:33:46, 11.41s/it] 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1718/5680 [4:29:01<12:51:57, 11.69s/it]                                                                                                                                                                                                                                             {'loss': '0.7382', 'grad_norm': '0.3156', 'learning_rate': '0.0001582', 'ppl': '2.092', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '660.4', 'tokens/total': 14073856, 'tokens/trainable': 13934791, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1718/5680 [4:29:01<12:51:57, 11.69s/it] 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1719/5680 [4:29:14<13:04:18, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6651', 'grad_norm': '0.291', 'learning_rate': '0.0001582', 'ppl': '1.945', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '660.1', 'tokens/total': 14082048, 'tokens/trainable': 13942922, 'epoch': '2.07'}
 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1719/5680 [4:29:14<13:04:18, 11.88s/it] 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1720/5680 [4:29:24<12:32:09, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.7897', 'grad_norm': '0.3134', 'learning_rate': '0.0001581', 'ppl': '2.203', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '791.5', 'tokens/total': 14090240, 'tokens/trainable': 13951038, 'epoch': '2.071'}
 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1720/5680 [4:29:24<12:32:09, 11.40s/it] 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1721/5680 [4:29:36<12:53:43, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.6178', 'grad_norm': '0.3387', 'learning_rate': '0.0001581', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.5', 'tokens/total': 14098432, 'tokens/trainable': 13959149, 'epoch': '2.071'}
 30%|█████████████████████████████████████████████████████████▊                                                                                                                                     | 1721/5680 [4:29:36<12:53:43, 11.73s/it] 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1722/5680 [4:29:48<12:56:33, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.6504', 'grad_norm': '0.2474', 'learning_rate': '0.000158', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '686.3', 'tokens/total': 14106624, 'tokens/trainable': 13967300, 'epoch': '2.071'}
 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1722/5680 [4:29:48<12:56:33, 11.77s/it] 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1723/5680 [4:29:59<12:34:38, 11.44s/it]                                                                                                                                                                                                                                             {'loss': '0.4964', 'grad_norm': '0.293', 'learning_rate': '0.000158', 'ppl': '1.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '766.2', 'tokens/total': 14114816, 'tokens/trainable': 13975468, 'epoch': '2.071'}
 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1723/5680 [4:29:59<12:34:38, 11.44s/it] 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1724/5680 [4:30:11<12:54:25, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.7276', 'grad_norm': '0.276', 'learning_rate': '0.0001579', 'ppl': '2.07', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.8', 'tokens/total': 14123008, 'tokens/trainable': 13983594, 'epoch': '2.071'}
 30%|█████████████████████████████████████████████████████████▉                                                                                                                                     | 1724/5680 [4:30:11<12:54:25, 11.75s/it] 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1725/5680 [4:30:23<12:50:06, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.723', 'grad_norm': '0.2726', 'learning_rate': '0.0001579', 'ppl': '2.061', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '709.7', 'tokens/total': 14131200, 'tokens/trainable': 13991775, 'epoch': '2.071'}
 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1725/5680 [4:30:23<12:50:06, 11.68s/it] 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1726/5680 [4:30:34<12:36:38, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.5147', 'grad_norm': '0.2716', 'learning_rate': '0.0001578', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '737.6', 'tokens/total': 14139392, 'tokens/trainable': 13999892, 'epoch': '2.072'}
 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1726/5680 [4:30:34<12:36:38, 11.48s/it] 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1727/5680 [4:30:47<12:58:28, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.9148', 'grad_norm': '0.3108', 'learning_rate': '0.0001578', 'ppl': '2.496', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.5', 'tokens/total': 14147584, 'tokens/trainable': 14008081, 'epoch': '2.072'}
 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1727/5680 [4:30:47<12:58:28, 11.82s/it] 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1728/5680 [4:30:58<12:46:22, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.7783', 'grad_norm': '0.3039', 'learning_rate': '0.0001577', 'ppl': '2.178', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '726.5', 'tokens/total': 14155776, 'tokens/trainable': 14016226, 'epoch': '2.072'}
 30%|██████████████████████████████████████████████████████████                                                                                                                                     | 1728/5680 [4:30:58<12:46:22, 11.64s/it] 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1729/5680 [4:31:09<12:40:37, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.615', 'grad_norm': '0.2378', 'learning_rate': '0.0001577', 'ppl': '1.85', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '710.8', 'tokens/total': 14163968, 'tokens/trainable': 14024289, 'epoch': '2.072'}
 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1729/5680 [4:31:09<12:40:37, 11.55s/it] 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1730/5680 [4:31:22<12:59:15, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6175', 'grad_norm': '0.2643', 'learning_rate': '0.0001577', 'ppl': '1.854', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649', 'tokens/total': 14172160, 'tokens/trainable': 14032399, 'epoch': '2.072'}
 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1730/5680 [4:31:22<12:59:15, 11.84s/it] 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1731/5680 [4:31:32<12:38:40, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.6122', 'grad_norm': '0.266', 'learning_rate': '0.0001576', 'ppl': '1.845', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '752.4', 'tokens/total': 14180352, 'tokens/trainable': 14040525, 'epoch': '2.073'}
 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1731/5680 [4:31:32<12:38:40, 11.53s/it] 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1732/5680 [4:31:44<12:41:11, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.6034', 'grad_norm': '0.2644', 'learning_rate': '0.0001576', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '702.2', 'tokens/total': 14188544, 'tokens/trainable': 14048709, 'epoch': '2.073'}
 30%|██████████████████████████████████████████████████████████▏                                                                                                                                    | 1732/5680 [4:31:44<12:41:11, 11.57s/it] 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1733/5680 [4:31:57<13:02:15, 11.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7087', 'grad_norm': '0.2877', 'learning_rate': '0.0001575', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642', 'tokens/total': 14196736, 'tokens/trainable': 14056822, 'epoch': '2.073'}
 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1733/5680 [4:31:57<13:02:15, 11.89s/it] 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1734/5680 [4:32:07<12:33:20, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.8126', 'grad_norm': '0.2974', 'learning_rate': '0.0001575', 'ppl': '2.254', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '776.9', 'tokens/total': 14204928, 'tokens/trainable': 14064926, 'epoch': '2.073'}
 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1734/5680 [4:32:07<12:33:20, 11.45s/it] 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1735/5680 [4:32:19<12:44:21, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.678', 'grad_norm': '0.2551', 'learning_rate': '0.0001574', 'ppl': '1.97', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '677.1', 'tokens/total': 14213120, 'tokens/trainable': 14073058, 'epoch': '2.073'}
 31%|██████████████████████████████████████████████████████████▎                                                                                                                                    | 1735/5680 [4:32:19<12:44:21, 11.63s/it] 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1736/5680 [4:32:32<13:03:36, 11.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9332', 'grad_norm': '0.2968', 'learning_rate': '0.0001574', 'ppl': '2.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.2', 'tokens/total': 14221312, 'tokens/trainable': 14081217, 'epoch': '2.073'}
 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1736/5680 [4:32:32<13:03:36, 11.92s/it] 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1737/5680 [4:32:42<12:33:18, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.7309', 'grad_norm': '0.2943', 'learning_rate': '0.0001573', 'ppl': '2.077', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '775.8', 'tokens/total': 14229504, 'tokens/trainable': 14089278, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1737/5680 [4:32:42<12:33:18, 11.46s/it] 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1738/5680 [4:32:54<12:46:16, 11.66s/it]                                                                                                                                                                                                                                             {'loss': '0.7115', 'grad_norm': '0.2517', 'learning_rate': '0.0001573', 'ppl': '2.037', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '670', 'tokens/total': 14237696, 'tokens/trainable': 14097401, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1738/5680 [4:32:54<12:46:16, 11.66s/it] 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1739/5680 [4:33:06<12:53:52, 11.78s/it]                                                                                                                                                                                                                                             {'loss': '0.4064', 'grad_norm': '0.2384', 'learning_rate': '0.0001572', 'ppl': '1.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '677', 'tokens/total': 14245888, 'tokens/trainable': 14105559, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▍                                                                                                                                    | 1739/5680 [4:33:06<12:53:52, 11.78s/it] 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1740/5680 [4:33:17<12:28:04, 11.39s/it]                                                                                                                                                                                                                                             {'loss': '0.4514', 'grad_norm': '0.2719', 'learning_rate': '0.0001572', 'ppl': '1.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '771', 'tokens/total': 14254080, 'tokens/trainable': 14113628, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1740/5680 [4:33:17<12:28:04, 11.39s/it] 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1741/5680 [4:33:29<12:47:36, 11.69s/it]                                                                                                                                                                                                                                             {'loss': '0.6734', 'grad_norm': '0.2802', 'learning_rate': '0.0001572', 'ppl': '1.961', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '658.6', 'tokens/total': 14262272, 'tokens/trainable': 14121789, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1741/5680 [4:33:29<12:47:36, 11.69s/it] 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1742/5680 [4:33:41<12:44:40, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7413', 'grad_norm': '0.2939', 'learning_rate': '0.0001571', 'ppl': '2.099', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '698.6', 'tokens/total': 14270464, 'tokens/trainable': 14129858, 'epoch': '2.074'}
 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1742/5680 [4:33:41<12:44:40, 11.65s/it] 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1743/5680 [4:33:52<12:31:26, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.6694', 'grad_norm': '0.3046', 'learning_rate': '0.0001571', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '743.1', 'tokens/total': 14278656, 'tokens/trainable': 14138013, 'epoch': '2.075'}
 31%|██████████████████████████████████████████████████████████▌                                                                                                                                    | 1743/5680 [4:33:52<12:31:26, 11.45s/it] 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1744/5680 [4:34:04<12:52:19, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.664', 'grad_norm': '0.2707', 'learning_rate': '0.000157', 'ppl': '1.943', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.4', 'tokens/total': 14286848, 'tokens/trainable': 14146104, 'epoch': '2.075'}
 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1744/5680 [4:34:04<12:52:19, 11.77s/it] 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1745/5680 [4:34:16<12:47:41, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.7581', 'grad_norm': '0.3572', 'learning_rate': '0.000157', 'ppl': '2.134', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '705', 'tokens/total': 14295040, 'tokens/trainable': 14154240, 'epoch': '2.075'}
 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1745/5680 [4:34:16<12:47:41, 11.71s/it] 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1746/5680 [4:34:27<12:39:14, 11.58s/it]                                                                                                                                                                                                                                             {'loss': '0.4868', 'grad_norm': '0.2846', 'learning_rate': '0.0001569', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '725.8', 'tokens/total': 14303232, 'tokens/trainable': 14162423, 'epoch': '2.075'}
 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1746/5680 [4:34:27<12:39:14, 11.58s/it] 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1747/5680 [4:34:40<12:59:09, 11.89s/it]                                                                                                                                                                                                                                             {'loss': '1.069', 'grad_norm': '0.3207', 'learning_rate': '0.0001569', 'ppl': '2.914', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.9', 'tokens/total': 14311424, 'tokens/trainable': 14170561, 'epoch': '2.075'}
 31%|██████████████████████████████████████████████████████████▋                                                                                                                                    | 1747/5680 [4:34:40<12:59:09, 11.89s/it] 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1748/5680 [4:34:51<12:41:56, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.7682', 'grad_norm': '0.3063', 'learning_rate': '0.0001568', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '743.1', 'tokens/total': 14319616, 'tokens/trainable': 14178746, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1748/5680 [4:34:51<12:41:56, 11.63s/it] 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1749/5680 [4:35:02<12:41:34, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.4876', 'grad_norm': '0.2617', 'learning_rate': '0.0001568', 'ppl': '1.628', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '702.5', 'tokens/total': 14327808, 'tokens/trainable': 14186900, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1749/5680 [4:35:02<12:41:34, 11.62s/it] 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1750/5680 [4:35:15<12:58:10, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4905', 'grad_norm': '0.2518', 'learning_rate': '0.0001567', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.8', 'tokens/total': 14336000, 'tokens/trainable': 14195054, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▊                                                                                                                                    | 1750/5680 [4:35:15<12:58:10, 11.88s/it] 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1751/5680 [4:35:26<12:35:26, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.6727', 'grad_norm': '0.3143', 'learning_rate': '0.0001567', 'ppl': '1.96', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '753.6', 'tokens/total': 14344192, 'tokens/trainable': 14203136, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1751/5680 [4:35:26<12:35:26, 11.54s/it] 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1752/5680 [4:35:37<12:40:22, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.5444', 'grad_norm': '0.253', 'learning_rate': '0.0001567', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '687.1', 'tokens/total': 14352384, 'tokens/trainable': 14211234, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1752/5680 [4:35:37<12:40:22, 11.61s/it] 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1753/5680 [4:35:50<12:58:56, 11.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5332', 'grad_norm': '0.2492', 'learning_rate': '0.0001566', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.2', 'tokens/total': 14360576, 'tokens/trainable': 14219389, 'epoch': '2.076'}
 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1753/5680 [4:35:50<12:58:56, 11.90s/it] 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1754/5680 [4:36:00<12:25:54, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.5903', 'grad_norm': '0.2797', 'learning_rate': '0.0001566', 'ppl': '1.804', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '796.8', 'tokens/total': 14368768, 'tokens/trainable': 14227534, 'epoch': '2.077'}
 31%|██████████████████████████████████████████████████████████▉                                                                                                                                    | 1754/5680 [4:36:00<12:25:54, 11.40s/it] 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1755/5680 [4:36:12<12:42:02, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7893', 'grad_norm': '0.281', 'learning_rate': '0.0001565', 'ppl': '2.202', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '669.2', 'tokens/total': 14376960, 'tokens/trainable': 14235714, 'epoch': '2.077'}
 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1755/5680 [4:36:12<12:42:02, 11.65s/it] 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1756/5680 [4:36:25<12:53:40, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.9061', 'grad_norm': '0.3428', 'learning_rate': '0.0001565', 'ppl': '2.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '666.4', 'tokens/total': 14385152, 'tokens/trainable': 14243874, 'epoch': '2.077'}
 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1756/5680 [4:36:25<12:53:40, 11.83s/it] 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1757/5680 [4:36:35<12:25:23, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.8038', 'grad_norm': '0.3156', 'learning_rate': '0.0001564', 'ppl': '2.234', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '783', 'tokens/total': 14393344, 'tokens/trainable': 14252014, 'epoch': '2.077'}
 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1757/5680 [4:36:35<12:25:23, 11.40s/it] 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1758/5680 [4:36:48<12:45:52, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.717', 'grad_norm': '0.3051', 'learning_rate': '0.0001564', 'ppl': '2.048', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.1', 'tokens/total': 14401536, 'tokens/trainable': 14260120, 'epoch': '2.077'}
 31%|███████████████████████████████████████████████████████████                                                                                                                                    | 1758/5680 [4:36:48<12:45:52, 11.72s/it] 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1759/5680 [4:36:59<12:47:33, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.5412', 'grad_norm': '0.2712', 'learning_rate': '0.0001563', 'ppl': '1.718', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '692.8', 'tokens/total': 14409728, 'tokens/trainable': 14268301, 'epoch': '2.077'}
 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1759/5680 [4:36:59<12:47:33, 11.75s/it] 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1760/5680 [4:37:10<12:28:43, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.7411', 'grad_norm': '0.3112', 'learning_rate': '0.0001563', 'ppl': '2.098', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '757.8', 'tokens/total': 14417920, 'tokens/trainable': 14276464, 'epoch': '2.078'}
 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1760/5680 [4:37:10<12:28:43, 11.46s/it] 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1761/5680 [4:37:23<12:47:28, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.5565', 'grad_norm': '0.2526', 'learning_rate': '0.0001562', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.8', 'tokens/total': 14426112, 'tokens/trainable': 14284611, 'epoch': '2.078'}
 31%|███████████████████████████████████████████████████████████▏                                                                                                                                   | 1761/5680 [4:37:23<12:47:28, 11.75s/it] 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1762/5680 [4:37:34<12:39:03, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.6109', 'grad_norm': '0.31', 'learning_rate': '0.0001562', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '722.1', 'tokens/total': 14434304, 'tokens/trainable': 14292787, 'epoch': '2.078'}
 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1762/5680 [4:37:34<12:39:03, 11.62s/it] 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1763/5680 [4:37:45<12:31:03, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.4724', 'grad_norm': '0.257', 'learning_rate': '0.0001562', 'ppl': '1.604', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '723.4', 'tokens/total': 14442496, 'tokens/trainable': 14300900, 'epoch': '2.078'}
 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1763/5680 [4:37:45<12:31:03, 11.50s/it] 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1764/5680 [4:37:58<12:49:52, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.8778', 'grad_norm': '0.2936', 'learning_rate': '0.0001561', 'ppl': '2.406', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.1', 'tokens/total': 14450688, 'tokens/trainable': 14309044, 'epoch': '2.078'}
 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1764/5680 [4:37:58<12:49:52, 11.80s/it] 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1765/5680 [4:38:08<12:27:26, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.5853', 'grad_norm': '0.2594', 'learning_rate': '0.0001561', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '762.4', 'tokens/total': 14458880, 'tokens/trainable': 14317167, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▎                                                                                                                                   | 1765/5680 [4:38:08<12:27:26, 11.45s/it] 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1766/5680 [4:38:20<12:33:42, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6065', 'grad_norm': '0.2591', 'learning_rate': '0.000156', 'ppl': '1.834', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '689.7', 'tokens/total': 14467072, 'tokens/trainable': 14325282, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1766/5680 [4:38:20<12:33:42, 11.55s/it] 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1767/5680 [4:38:33<12:52:04, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6386', 'grad_norm': '0.278', 'learning_rate': '0.000156', 'ppl': '1.894', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.2', 'tokens/total': 14475264, 'tokens/trainable': 14333394, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1767/5680 [4:38:33<12:52:04, 11.84s/it] 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1768/5680 [4:38:43<12:19:15, 11.34s/it]                                                                                                                                                                                                                                             {'loss': '0.8376', 'grad_norm': '0.2749', 'learning_rate': '0.0001559', 'ppl': '2.311', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '794.9', 'tokens/total': 14483456, 'tokens/trainable': 14341477, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1768/5680 [4:38:43<12:19:15, 11.34s/it] 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1769/5680 [4:38:55<12:36:21, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.9559', 'grad_norm': '0.2856', 'learning_rate': '0.0001559', 'ppl': '2.601', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '666.7', 'tokens/total': 14491648, 'tokens/trainable': 14349618, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▍                                                                                                                                   | 1769/5680 [4:38:55<12:36:21, 11.60s/it] 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1770/5680 [4:39:07<12:47:20, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.6487', 'grad_norm': '0.2685', 'learning_rate': '0.0001558', 'ppl': '1.913', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '670', 'tokens/total': 14499840, 'tokens/trainable': 14357771, 'epoch': '2.079'}
 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1770/5680 [4:39:07<12:47:20, 11.77s/it] 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1771/5680 [4:39:18<12:22:54, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.928', 'grad_norm': '0.3609', 'learning_rate': '0.0001558', 'ppl': '2.53', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '776.3', 'tokens/total': 14508032, 'tokens/trainable': 14365943, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1771/5680 [4:39:18<12:22:54, 11.40s/it] 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1772/5680 [4:39:30<12:42:27, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.2393', 'learning_rate': '0.0001557', 'ppl': '1.666', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '657.5', 'tokens/total': 14516224, 'tokens/trainable': 14374100, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1772/5680 [4:39:30<12:42:27, 11.71s/it] 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1773/5680 [4:39:42<12:45:05, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.5298', 'grad_norm': '0.2435', 'learning_rate': '0.0001557', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '688.4', 'tokens/total': 14524416, 'tokens/trainable': 14382252, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▌                                                                                                                                   | 1773/5680 [4:39:42<12:45:05, 11.75s/it] 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1774/5680 [4:39:53<12:26:21, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.4984', 'grad_norm': '0.25', 'learning_rate': '0.0001556', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '751.8', 'tokens/total': 14532608, 'tokens/trainable': 14390369, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1774/5680 [4:39:53<12:26:21, 11.46s/it] 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1775/5680 [4:40:05<12:47:03, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.3891', 'grad_norm': '0.2471', 'learning_rate': '0.0001556', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.7', 'tokens/total': 14540800, 'tokens/trainable': 14398557, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1775/5680 [4:40:05<12:47:03, 11.79s/it] 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1776/5680 [4:40:17<12:41:36, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6439', 'grad_norm': '0.2891', 'learning_rate': '0.0001556', 'ppl': '1.904', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '704.7', 'tokens/total': 14548992, 'tokens/trainable': 14406669, 'epoch': '2.08'}
 31%|███████████████████████████████████████████████████████████▋                                                                                                                                   | 1776/5680 [4:40:17<12:41:36, 11.71s/it] 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1777/5680 [4:40:28<12:28:37, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5238', 'grad_norm': '0.2664', 'learning_rate': '0.0001555', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '736.7', 'tokens/total': 14557184, 'tokens/trainable': 14414801, 'epoch': '2.081'}
 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1777/5680 [4:40:28<12:28:37, 11.51s/it] 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1778/5680 [4:40:40<12:48:07, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.842', 'grad_norm': '0.2966', 'learning_rate': '0.0001555', 'ppl': '2.321', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.9', 'tokens/total': 14565376, 'tokens/trainable': 14422895, 'epoch': '2.081'}
 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1778/5680 [4:40:40<12:48:07, 11.81s/it] 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1779/5680 [4:40:52<12:34:36, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.626', 'grad_norm': '0.263', 'learning_rate': '0.0001554', 'ppl': '1.87', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '735.4', 'tokens/total': 14573568, 'tokens/trainable': 14431076, 'epoch': '2.081'}
 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1779/5680 [4:40:52<12:34:36, 11.61s/it] 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1780/5680 [4:41:03<12:31:19, 11.56s/it]                                                                                                                                                                                                                                             {'loss': '0.6377', 'grad_norm': '0.2374', 'learning_rate': '0.0001554', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '708.5', 'tokens/total': 14581760, 'tokens/trainable': 14439182, 'epoch': '2.081'}
 31%|███████████████████████████████████████████████████████████▊                                                                                                                                   | 1780/5680 [4:41:03<12:31:19, 11.56s/it] 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1781/5680 [4:41:15<12:49:56, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6457', 'grad_norm': '0.2886', 'learning_rate': '0.0001553', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.5', 'tokens/total': 14589952, 'tokens/trainable': 14447338, 'epoch': '2.081'}
 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1781/5680 [4:41:15<12:49:56, 11.85s/it] 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1782/5680 [4:41:26<12:29:34, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '1.01', 'grad_norm': '0.3505', 'learning_rate': '0.0001553', 'ppl': '2.746', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '755.7', 'tokens/total': 14598144, 'tokens/trainable': 14455504, 'epoch': '2.082'}
 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1782/5680 [4:41:26<12:29:34, 11.54s/it] 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1783/5680 [4:41:38<12:32:54, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6543', 'grad_norm': '0.2667', 'learning_rate': '0.0001552', 'ppl': '1.924', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '698.4', 'tokens/total': 14606336, 'tokens/trainable': 14463680, 'epoch': '2.082'}
 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1783/5680 [4:41:38<12:32:54, 11.59s/it] 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1784/5680 [4:41:51<12:52:16, 11.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6556', 'grad_norm': '0.3136', 'learning_rate': '0.0001552', 'ppl': '1.926', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.7', 'tokens/total': 14614528, 'tokens/trainable': 14471832, 'epoch': '2.082'}
 31%|███████████████████████████████████████████████████████████▉                                                                                                                                   | 1784/5680 [4:41:51<12:52:16, 11.89s/it] 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1785/5680 [4:42:01<12:23:02, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.7849', 'grad_norm': '0.3249', 'learning_rate': '0.0001551', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '782.9', 'tokens/total': 14622720, 'tokens/trainable': 14479968, 'epoch': '2.082'}
 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1785/5680 [4:42:01<12:23:02, 11.45s/it] 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1786/5680 [4:42:13<12:35:59, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5582', 'grad_norm': '0.2713', 'learning_rate': '0.0001551', 'ppl': '1.747', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '674', 'tokens/total': 14630912, 'tokens/trainable': 14488112, 'epoch': '2.082'}
 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1786/5680 [4:42:13<12:35:59, 11.65s/it] 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1787/5680 [4:42:25<12:47:17, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4816', 'grad_norm': '0.2606', 'learning_rate': '0.0001551', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '668.7', 'tokens/total': 14639104, 'tokens/trainable': 14496294, 'epoch': '2.082'}
 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1787/5680 [4:42:25<12:47:17, 11.83s/it] 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1788/5680 [4:42:36<12:23:58, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.724', 'grad_norm': '0.3131', 'learning_rate': '0.000155', 'ppl': '2.063', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '768.8', 'tokens/total': 14647296, 'tokens/trainable': 14504453, 'epoch': '2.083'}
 31%|████████████████████████████████████████████████████████████                                                                                                                                   | 1788/5680 [4:42:36<12:23:58, 11.47s/it] 31%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1789/5680 [4:42:48<12:43:42, 11.78s/it]                                                                                                                                                                                                                                             {'loss': '0.5714', 'grad_norm': '0.2531', 'learning_rate': '0.000155', 'ppl': '1.771', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645', 'tokens/total': 14655488, 'tokens/trainable': 14512507, 'epoch': '2.083'}
 31%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1789/5680 [4:42:48<12:43:42, 11.78s/it] 32%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1790/5680 [4:43:00<12:46:59, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6406', 'grad_norm': '0.2889', 'learning_rate': '0.0001549', 'ppl': '1.898', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '673.1', 'tokens/total': 14663680, 'tokens/trainable': 14520552, 'epoch': '2.083'}
 32%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1790/5680 [4:43:00<12:46:59, 11.83s/it] 32%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1791/5680 [4:43:11<12:23:28, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.8389', 'grad_norm': '0.3766', 'learning_rate': '0.0001549', 'ppl': '2.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '762.6', 'tokens/total': 14671872, 'tokens/trainable': 14528650, 'epoch': '2.083'}
 32%|████████████████████████████████████████████████████████████▏                                                                                                                                  | 1791/5680 [4:43:11<12:23:28, 11.47s/it] 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1792/5680 [4:43:24<12:43:58, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.4615', 'grad_norm': '0.2407', 'learning_rate': '0.0001548', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.4', 'tokens/total': 14680064, 'tokens/trainable': 14536798, 'epoch': '2.083'}
 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1792/5680 [4:43:24<12:43:58, 11.79s/it] 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1793/5680 [4:43:35<12:41:53, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.8258', 'grad_norm': '0.2941', 'learning_rate': '0.0001548', 'ppl': '2.284', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '700.5', 'tokens/total': 14688256, 'tokens/trainable': 14544982, 'epoch': '2.083'}
 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1793/5680 [4:43:35<12:41:53, 11.76s/it] 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1794/5680 [4:43:46<12:26:12, 11.52s/it]                                                                                                                                                                                                                                             {'loss': '0.5054', 'grad_norm': '0.2882', 'learning_rate': '0.0001547', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '736.6', 'tokens/total': 14696448, 'tokens/trainable': 14553048, 'epoch': '2.084'}
 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1794/5680 [4:43:46<12:26:12, 11.52s/it] 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1795/5680 [4:43:59<12:44:40, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.5008', 'grad_norm': '0.251', 'learning_rate': '0.0001547', 'ppl': '1.65', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648', 'tokens/total': 14704640, 'tokens/trainable': 14561130, 'epoch': '2.084'}
 32%|████████████████████████████████████████████████████████████▎                                                                                                                                  | 1795/5680 [4:43:59<12:44:40, 11.81s/it] 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1796/5680 [4:44:10<12:30:10, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6114', 'grad_norm': '0.2671', 'learning_rate': '0.0001546', 'ppl': '1.843', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '729.6', 'tokens/total': 14712832, 'tokens/trainable': 14569203, 'epoch': '2.084'}
 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1796/5680 [4:44:10<12:30:10, 11.59s/it] 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1797/5680 [4:44:21<12:26:27, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.6175', 'grad_norm': '0.2964', 'learning_rate': '0.0001546', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '712.5', 'tokens/total': 14721024, 'tokens/trainable': 14577324, 'epoch': '2.084'}
 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1797/5680 [4:44:21<12:26:27, 11.53s/it] 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1798/5680 [4:44:34<12:45:08, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6065', 'grad_norm': '0.3211', 'learning_rate': '0.0001545', 'ppl': '1.834', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645', 'tokens/total': 14729216, 'tokens/trainable': 14585389, 'epoch': '2.084'}
 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1798/5680 [4:44:34<12:45:08, 11.83s/it] 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1799/5680 [4:44:44<12:21:04, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.733', 'grad_norm': '0.3568', 'learning_rate': '0.0001545', 'ppl': '2.081', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '770.9', 'tokens/total': 14737408, 'tokens/trainable': 14593554, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▍                                                                                                                                  | 1799/5680 [4:44:44<12:21:04, 11.46s/it] 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1800/5680 [4:44:56<12:28:31, 11.58s/it]                                                                                                                                                                                                                                             {'loss': '0.6565', 'grad_norm': '0.2926', 'learning_rate': '0.0001544', 'ppl': '1.928', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '686', 'tokens/total': 14745600, 'tokens/trainable': 14601675, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1800/5680 [4:44:56<12:28:31, 11.58s/it] 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1801/5680 [4:45:09<12:46:33, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6533', 'grad_norm': '0.2682', 'learning_rate': '0.0001544', 'ppl': '1.922', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '654.1', 'tokens/total': 14753792, 'tokens/trainable': 14609859, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1801/5680 [4:45:09<12:46:33, 11.86s/it] 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1802/5680 [4:45:19<12:17:41, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.749', 'grad_norm': '0.2803', 'learning_rate': '0.0001544', 'ppl': '2.115', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '777.1', 'tokens/total': 14761984, 'tokens/trainable': 14617918, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▌                                                                                                                                  | 1802/5680 [4:45:19<12:17:41, 11.41s/it] 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1803/5680 [4:45:31<12:31:09, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.876', 'grad_norm': '0.2856', 'learning_rate': '0.0001543', 'ppl': '2.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '672.6', 'tokens/total': 14770176, 'tokens/trainable': 14626062, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1803/5680 [4:45:31<12:31:09, 11.62s/it] 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1804/5680 [4:45:44<12:44:31, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.9304', 'grad_norm': '0.281', 'learning_rate': '0.0001543', 'ppl': '2.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '664.3', 'tokens/total': 14778368, 'tokens/trainable': 14634243, 'epoch': '2.085'}
 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1804/5680 [4:45:44<12:44:31, 11.83s/it] 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1805/5680 [4:45:54<12:13:39, 11.36s/it]                                                                                                                                                                                                                                             {'loss': '0.7565', 'grad_norm': '0.2973', 'learning_rate': '0.0001542', 'ppl': '2.131', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '796.3', 'tokens/total': 14786560, 'tokens/trainable': 14642398, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1805/5680 [4:45:54<12:13:39, 11.36s/it] 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1806/5680 [4:46:06<12:34:02, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.7576', 'grad_norm': '0.2876', 'learning_rate': '0.0001542', 'ppl': '2.133', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '658.7', 'tokens/total': 14794752, 'tokens/trainable': 14650578, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▋                                                                                                                                  | 1806/5680 [4:46:06<12:34:02, 11.68s/it] 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1807/5680 [4:46:18<12:37:07, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.7565', 'grad_norm': '0.3207', 'learning_rate': '0.0001541', 'ppl': '2.131', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '686.1', 'tokens/total': 14802944, 'tokens/trainable': 14658703, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1807/5680 [4:46:18<12:37:07, 11.73s/it] 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1808/5680 [4:46:29<12:19:57, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.3582', 'grad_norm': '0.2138', 'learning_rate': '0.0001541', 'ppl': '1.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '745.2', 'tokens/total': 14811136, 'tokens/trainable': 14666788, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1808/5680 [4:46:29<12:19:57, 11.47s/it] 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1809/5680 [4:46:41<12:38:55, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.5297', 'grad_norm': '0.2442', 'learning_rate': '0.000154', 'ppl': '1.698', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.3', 'tokens/total': 14819328, 'tokens/trainable': 14674883, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1809/5680 [4:46:41<12:38:55, 11.76s/it] 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1810/5680 [4:46:53<12:28:25, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.8647', 'grad_norm': '0.3177', 'learning_rate': '0.000154', 'ppl': '2.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721.4', 'tokens/total': 14827520, 'tokens/trainable': 14682980, 'epoch': '2.086'}
 32%|████████████████████████████████████████████████████████████▊                                                                                                                                  | 1810/5680 [4:46:53<12:28:25, 11.60s/it] 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1811/5680 [4:47:04<12:22:15, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.769', 'grad_norm': '0.2905', 'learning_rate': '0.0001539', 'ppl': '2.158', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.6', 'tokens/total': 14835712, 'tokens/trainable': 14691055, 'epoch': '2.087'}
 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1811/5680 [4:47:04<12:22:15, 11.51s/it] 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1812/5680 [4:47:16<12:41:21, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.5776', 'grad_norm': '0.2823', 'learning_rate': '0.0001539', 'ppl': '1.782', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.7', 'tokens/total': 14843904, 'tokens/trainable': 14699202, 'epoch': '2.087'}
 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1812/5680 [4:47:16<12:41:21, 11.81s/it] 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1813/5680 [4:47:27<12:22:39, 11.52s/it]                                                                                                                                                                                                                                             {'loss': '0.7713', 'grad_norm': '0.2603', 'learning_rate': '0.0001538', 'ppl': '2.163', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '746.9', 'tokens/total': 14852096, 'tokens/trainable': 14707305, 'epoch': '2.087'}
 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1813/5680 [4:47:27<12:22:39, 11.52s/it] 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1814/5680 [4:47:39<12:23:39, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.6407', 'grad_norm': '0.2411', 'learning_rate': '0.0001538', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '691.5', 'tokens/total': 14860288, 'tokens/trainable': 14715306, 'epoch': '2.087'}
 32%|████████████████████████████████████████████████████████████▉                                                                                                                                  | 1814/5680 [4:47:39<12:23:39, 11.54s/it] 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1815/5680 [4:47:51<12:42:47, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.8199', 'grad_norm': '0.3459', 'learning_rate': '0.0001538', 'ppl': '2.27', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.5', 'tokens/total': 14868480, 'tokens/trainable': 14723385, 'epoch': '2.087'}
 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1815/5680 [4:47:51<12:42:47, 11.84s/it] 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1816/5680 [4:48:02<12:18:57, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.6345', 'grad_norm': '0.2739', 'learning_rate': '0.0001537', 'ppl': '1.886', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '766.2', 'tokens/total': 14876672, 'tokens/trainable': 14731518, 'epoch': '2.087'}
 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1816/5680 [4:48:02<12:18:57, 11.47s/it] 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1817/5680 [4:48:14<12:28:53, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.5498', 'grad_norm': '0.285', 'learning_rate': '0.0001537', 'ppl': '1.733', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '674.9', 'tokens/total': 14884864, 'tokens/trainable': 14739606, 'epoch': '2.088'}
 32%|█████████████████████████████████████████████████████████████                                                                                                                                  | 1817/5680 [4:48:14<12:28:53, 11.63s/it] 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1818/5680 [4:48:26<12:44:52, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6076', 'grad_norm': '0.3003', 'learning_rate': '0.0001536', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.8', 'tokens/total': 14893056, 'tokens/trainable': 14747757, 'epoch': '2.088'}
 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1818/5680 [4:48:26<12:44:52, 11.88s/it] 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1819/5680 [4:48:37<12:10:29, 11.35s/it]                                                                                                                                                                                                                                             {'loss': '0.4127', 'grad_norm': '0.2357', 'learning_rate': '0.0001536', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '807.8', 'tokens/total': 14901248, 'tokens/trainable': 14755924, 'epoch': '2.088'}
 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1819/5680 [4:48:37<12:10:29, 11.35s/it] 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1820/5680 [4:48:49<12:31:39, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.6297', 'grad_norm': '0.2962', 'learning_rate': '0.0001535', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651', 'tokens/total': 14909440, 'tokens/trainable': 14764032, 'epoch': '2.088'}
 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1820/5680 [4:48:49<12:31:39, 11.68s/it] 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1821/5680 [4:49:01<12:39:24, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.3817', 'grad_norm': '0.2978', 'learning_rate': '0.0001535', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '675.3', 'tokens/total': 14917632, 'tokens/trainable': 14772199, 'epoch': '2.088'}
 32%|█████████████████████████████████████████████████████████████▏                                                                                                                                 | 1821/5680 [4:49:01<12:39:24, 11.81s/it] 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1822/5680 [4:49:12<12:16:10, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.5596', 'grad_norm': '0.2784', 'learning_rate': '0.0001534', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '757.1', 'tokens/total': 14925824, 'tokens/trainable': 14780223, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1822/5680 [4:49:12<12:16:10, 11.45s/it] 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1823/5680 [4:49:24<12:34:26, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.5718', 'grad_norm': '0.279', 'learning_rate': '0.0001534', 'ppl': '1.771', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '654.8', 'tokens/total': 14934016, 'tokens/trainable': 14788343, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1823/5680 [4:49:24<12:34:26, 11.74s/it] 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1824/5680 [4:49:36<12:33:09, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.6386', 'grad_norm': '0.2682', 'learning_rate': '0.0001533', 'ppl': '1.894', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '682.5', 'tokens/total': 14942208, 'tokens/trainable': 14796310, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1824/5680 [4:49:36<12:33:09, 11.72s/it] 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1825/5680 [4:49:47<12:25:18, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.7161', 'grad_norm': '0.2545', 'learning_rate': '0.0001533', 'ppl': '2.046', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '720.1', 'tokens/total': 14950400, 'tokens/trainable': 14804455, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▎                                                                                                                                 | 1825/5680 [4:49:47<12:25:18, 11.60s/it] 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1826/5680 [4:50:00<12:50:13, 11.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3919', 'grad_norm': '0.2251', 'learning_rate': '0.0001532', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.7', 'tokens/total': 14958592, 'tokens/trainable': 14812487, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1826/5680 [4:50:00<12:50:13, 11.99s/it] 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1827/5680 [4:50:12<12:42:23, 11.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5043', 'grad_norm': '0.2512', 'learning_rate': '0.0001532', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.1', 'tokens/total': 14966784, 'tokens/trainable': 14820615, 'epoch': '2.089'}
 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1827/5680 [4:50:12<12:42:23, 11.87s/it] 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1828/5680 [4:50:23<12:28:59, 11.67s/it]                                                                                                                                                                                                                                             {'loss': '0.5879', 'grad_norm': '0.2996', 'learning_rate': '0.0001531', 'ppl': '1.8', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '718.8', 'tokens/total': 14974976, 'tokens/trainable': 14828648, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▍                                                                                                                                 | 1828/5680 [4:50:23<12:28:59, 11.67s/it] 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1829/5680 [4:50:36<12:49:10, 11.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5772', 'grad_norm': '0.2997', 'learning_rate': '0.0001531', 'ppl': '1.781', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.2', 'tokens/total': 14983168, 'tokens/trainable': 14836715, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1829/5680 [4:50:36<12:49:10, 11.98s/it] 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1830/5680 [4:50:47<12:38:27, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.7658', 'grad_norm': '0.3336', 'learning_rate': '0.000153', 'ppl': '2.151', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.8', 'tokens/total': 14991360, 'tokens/trainable': 14844737, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1830/5680 [4:50:47<12:38:27, 11.82s/it] 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1831/5680 [4:50:58<12:32:05, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.6135', 'grad_norm': '0.2653', 'learning_rate': '0.000153', 'ppl': '1.847', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '704.1', 'tokens/total': 14999552, 'tokens/trainable': 14852829, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1831/5680 [4:50:58<12:32:05, 11.72s/it] 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1832/5680 [4:51:11<12:47:36, 11.97s/it]                                                                                                                                                                                                                                             {'loss': '0.68', 'grad_norm': '0.3082', 'learning_rate': '0.000153', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.9', 'tokens/total': 15007744, 'tokens/trainable': 14860789, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▌                                                                                                                                 | 1832/5680 [4:51:11<12:47:36, 11.97s/it] 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1833/5680 [4:51:22<12:35:16, 11.78s/it]                                                                                                                                                                                                                                             {'loss': '0.6978', 'grad_norm': '0.2825', 'learning_rate': '0.0001529', 'ppl': '2.009', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '710.9', 'tokens/total': 15015936, 'tokens/trainable': 14868846, 'epoch': '2.09'}
 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1833/5680 [4:51:22<12:35:16, 11.78s/it] 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1834/5680 [4:51:34<12:26:45, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.4857', 'grad_norm': '0.245', 'learning_rate': '0.0001529', 'ppl': '1.625', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.8', 'tokens/total': 15024128, 'tokens/trainable': 14876967, 'epoch': '2.091'}
 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1834/5680 [4:51:34<12:26:45, 11.65s/it] 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1835/5680 [4:51:46<12:45:25, 11.94s/it]                                                                                                                                                                                                                                             {'loss': '0.841', 'grad_norm': '0.3157', 'learning_rate': '0.0001528', 'ppl': '2.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '641.4', 'tokens/total': 15032320, 'tokens/trainable': 14885067, 'epoch': '2.091'}
 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1835/5680 [4:51:46<12:45:25, 11.94s/it] 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1836/5680 [4:51:58<12:45:07, 11.94s/it]                                                                                                                                                                                                                                             {'loss': '1.054', 'grad_norm': '0.442', 'learning_rate': '0.0001528', 'ppl': '2.87', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.5', 'tokens/total': 15040512, 'tokens/trainable': 14893161, 'epoch': '2.091'}
 32%|█████████████████████████████████████████████████████████████▋                                                                                                                                 | 1836/5680 [4:51:58<12:45:07, 11.94s/it] 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1837/5680 [4:52:09<12:30:03, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6555', 'grad_norm': '0.3211', 'learning_rate': '0.0001527', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.6', 'tokens/total': 15048704, 'tokens/trainable': 14901128, 'epoch': '2.091'}
 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1837/5680 [4:52:09<12:30:03, 11.71s/it] 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1838/5680 [4:52:22<12:46:44, 11.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4948', 'grad_norm': '0.2288', 'learning_rate': '0.0001527', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.6', 'tokens/total': 15056896, 'tokens/trainable': 14909279, 'epoch': '2.091'}
 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1838/5680 [4:52:22<12:46:44, 11.97s/it] 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1839/5680 [4:52:33<12:32:16, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.4524', 'grad_norm': '0.2736', 'learning_rate': '0.0001526', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '712.6', 'tokens/total': 15065088, 'tokens/trainable': 14917279, 'epoch': '2.092'}
 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1839/5680 [4:52:33<12:32:16, 11.75s/it] 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1840/5680 [4:52:45<12:28:54, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.5313', 'grad_norm': '0.2717', 'learning_rate': '0.0001526', 'ppl': '1.701', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '695.2', 'tokens/total': 15073280, 'tokens/trainable': 14925329, 'epoch': '2.092'}
 32%|█████████████████████████████████████████████████████████████▊                                                                                                                                 | 1840/5680 [4:52:45<12:28:54, 11.70s/it] 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1841/5680 [4:52:58<12:46:55, 11.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6407', 'grad_norm': '0.295', 'learning_rate': '0.0001525', 'ppl': '1.898', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642.5', 'tokens/total': 15081472, 'tokens/trainable': 14933453, 'epoch': '2.092'}
 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1841/5680 [4:52:58<12:46:55, 11.99s/it] 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1842/5680 [4:53:08<12:22:30, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.5709', 'grad_norm': '0.3009', 'learning_rate': '0.0001525', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '730.8', 'tokens/total': 15089664, 'tokens/trainable': 14941285, 'epoch': '2.092'}
 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1842/5680 [4:53:08<12:22:30, 11.61s/it] 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1843/5680 [4:53:20<12:26:42, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.6707', 'grad_norm': '0.2654', 'learning_rate': '0.0001524', 'ppl': '1.956', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '691.5', 'tokens/total': 15097856, 'tokens/trainable': 14949461, 'epoch': '2.092'}
 32%|█████████████████████████████████████████████████████████████▉                                                                                                                                 | 1843/5680 [4:53:20<12:26:42, 11.68s/it] 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1844/5680 [4:53:33<12:44:38, 11.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5524', 'grad_norm': '0.3131', 'learning_rate': '0.0001524', 'ppl': '1.737', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '623.8', 'tokens/total': 15106048, 'tokens/trainable': 14957332, 'epoch': '2.092'}
 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1844/5680 [4:53:33<12:44:38, 11.96s/it] 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1845/5680 [4:53:43<12:18:07, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6845', 'grad_norm': '0.3158', 'learning_rate': '0.0001523', 'ppl': '1.983', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '752.4', 'tokens/total': 15114240, 'tokens/trainable': 14965292, 'epoch': '2.093'}
 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1845/5680 [4:53:43<12:18:07, 11.55s/it] 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1846/5680 [4:53:55<12:28:55, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.8228', 'grad_norm': '0.3015', 'learning_rate': '0.0001523', 'ppl': '2.277', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '657', 'tokens/total': 15122432, 'tokens/trainable': 14973249, 'epoch': '2.093'}
 32%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1846/5680 [4:53:55<12:28:55, 11.72s/it] 33%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1847/5680 [4:54:08<12:46:31, 12.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6151', 'grad_norm': '0.2887', 'learning_rate': '0.0001522', 'ppl': '1.85', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '641.6', 'tokens/total': 15130624, 'tokens/trainable': 14981358, 'epoch': '2.093'}
 33%|██████████████████████████████████████████████████████████████                                                                                                                                 | 1847/5680 [4:54:08<12:46:31, 12.00s/it] 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1848/5680 [4:54:18<12:12:26, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.5654', 'grad_norm': '0.2595', 'learning_rate': '0.0001522', 'ppl': '1.76', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '776.7', 'tokens/total': 15138816, 'tokens/trainable': 14989299, 'epoch': '2.093'}
 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1848/5680 [4:54:18<12:12:26, 11.47s/it] 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1849/5680 [4:54:31<12:27:13, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.5097', 'grad_norm': '0.2805', 'learning_rate': '0.0001522', 'ppl': '1.665', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '669.2', 'tokens/total': 15147008, 'tokens/trainable': 14997488, 'epoch': '2.093'}
 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1849/5680 [4:54:31<12:27:13, 11.70s/it] 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1850/5680 [4:54:43<12:36:46, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5738', 'grad_norm': '0.3049', 'learning_rate': '0.0001521', 'ppl': '1.775', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '661.5', 'tokens/total': 15155200, 'tokens/trainable': 15005564, 'epoch': '2.093'}
 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1850/5680 [4:54:43<12:36:46, 11.86s/it] 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1851/5680 [4:54:53<12:09:33, 11.43s/it]                                                                                                                                                                                                                                             {'loss': '0.6084', 'grad_norm': '0.2734', 'learning_rate': '0.0001521', 'ppl': '1.838', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '770.7', 'tokens/total': 15163392, 'tokens/trainable': 15013603, 'epoch': '2.094'}
 33%|██████████████████████████████████████████████████████████████▏                                                                                                                                | 1851/5680 [4:54:53<12:09:33, 11.43s/it] 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1852/5680 [4:55:06<12:28:03, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.5067', 'grad_norm': '0.2394', 'learning_rate': '0.000152', 'ppl': '1.66', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648.7', 'tokens/total': 15171584, 'tokens/trainable': 15021650, 'epoch': '2.094'}
 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1852/5680 [4:55:06<12:28:03, 11.72s/it] 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1853/5680 [4:55:17<12:28:41, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.7749', 'grad_norm': '0.2877', 'learning_rate': '0.000152', 'ppl': '2.17', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '690.7', 'tokens/total': 15179776, 'tokens/trainable': 15029777, 'epoch': '2.094'}
 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1853/5680 [4:55:17<12:28:41, 11.74s/it] 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1854/5680 [4:55:28<12:10:53, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.74', 'grad_norm': '0.2864', 'learning_rate': '0.0001519', 'ppl': '2.096', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '739', 'tokens/total': 15187968, 'tokens/trainable': 15037768, 'epoch': '2.094'}
 33%|██████████████████████████████████████████████████████████████▎                                                                                                                                | 1854/5680 [4:55:28<12:10:53, 11.46s/it] 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1855/5680 [4:55:41<12:31:16, 11.78s/it]                                                                                                                                                                                                                                             {'loss': '0.5341', 'grad_norm': '0.2994', 'learning_rate': '0.0001519', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.8', 'tokens/total': 15196160, 'tokens/trainable': 15045749, 'epoch': '2.094'}
 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1855/5680 [4:55:41<12:31:16, 11.78s/it] 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1856/5680 [4:55:52<12:24:31, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.5984', 'grad_norm': '0.3389', 'learning_rate': '0.0001518', 'ppl': '1.819', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.2', 'tokens/total': 15204352, 'tokens/trainable': 15053930, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1856/5680 [4:55:52<12:24:31, 11.68s/it] 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1857/5680 [4:56:03<12:15:15, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.7341', 'grad_norm': '0.3397', 'learning_rate': '0.0001518', 'ppl': '2.084', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '708.1', 'tokens/total': 15212544, 'tokens/trainable': 15061840, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1857/5680 [4:56:03<12:15:15, 11.54s/it] 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1858/5680 [4:56:16<12:34:34, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6145', 'grad_norm': '0.3428', 'learning_rate': '0.0001517', 'ppl': '1.849', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '639.1', 'tokens/total': 15220736, 'tokens/trainable': 15069864, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▍                                                                                                                                | 1858/5680 [4:56:16<12:34:34, 11.85s/it] 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1859/5680 [4:56:27<12:16:59, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.6358', 'grad_norm': '0.2556', 'learning_rate': '0.0001517', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '727', 'tokens/total': 15228928, 'tokens/trainable': 15077811, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1859/5680 [4:56:27<12:16:59, 11.57s/it] 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1860/5680 [4:56:38<12:15:34, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6391', 'grad_norm': '0.2634', 'learning_rate': '0.0001516', 'ppl': '1.895', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.8', 'tokens/total': 15237120, 'tokens/trainable': 15085880, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1860/5680 [4:56:38<12:15:34, 11.55s/it] 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1861/5680 [4:56:51<12:34:14, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7962', 'grad_norm': '0.2815', 'learning_rate': '0.0001516', 'ppl': '2.217', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.2', 'tokens/total': 15245312, 'tokens/trainable': 15093970, 'epoch': '2.095'}
 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1861/5680 [4:56:51<12:34:14, 11.85s/it] 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1862/5680 [4:57:02<12:10:18, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.5395', 'grad_norm': '0.2554', 'learning_rate': '0.0001515', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '763.2', 'tokens/total': 15253504, 'tokens/trainable': 15102063, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▌                                                                                                                                | 1862/5680 [4:57:02<12:10:18, 11.48s/it] 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1863/5680 [4:57:13<12:17:30, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6287', 'grad_norm': '0.2924', 'learning_rate': '0.0001515', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '662.3', 'tokens/total': 15261696, 'tokens/trainable': 15109907, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1863/5680 [4:57:13<12:17:30, 11.59s/it] 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1864/5680 [4:57:26<12:35:52, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7446', 'grad_norm': '0.2519', 'learning_rate': '0.0001514', 'ppl': '2.106', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '641.1', 'tokens/total': 15269888, 'tokens/trainable': 15117958, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1864/5680 [4:57:26<12:35:52, 11.88s/it] 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1865/5680 [4:57:36<12:03:41, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.5568', 'grad_norm': '0.2521', 'learning_rate': '0.0001514', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '779', 'tokens/total': 15278080, 'tokens/trainable': 15125904, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1865/5680 [4:57:36<12:03:41, 11.38s/it] 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1866/5680 [4:57:48<12:20:33, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5369', 'grad_norm': '0.2797', 'learning_rate': '0.0001514', 'ppl': '1.711', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.2', 'tokens/total': 15286272, 'tokens/trainable': 15133939, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▋                                                                                                                                | 1866/5680 [4:57:48<12:20:33, 11.65s/it] 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1867/5680 [4:58:01<12:28:16, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.8477', 'grad_norm': '0.3306', 'learning_rate': '0.0001513', 'ppl': '2.334', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '672.4', 'tokens/total': 15294464, 'tokens/trainable': 15142046, 'epoch': '2.096'}
 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1867/5680 [4:58:01<12:28:16, 11.77s/it] 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1868/5680 [4:58:11<12:04:48, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.408', 'grad_norm': '0.2646', 'learning_rate': '0.0001513', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '751.4', 'tokens/total': 15302656, 'tokens/trainable': 15149963, 'epoch': '2.097'}
 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1868/5680 [4:58:11<12:04:48, 11.41s/it] 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1869/5680 [4:58:23<12:23:40, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.7126', 'grad_norm': '0.2962', 'learning_rate': '0.0001512', 'ppl': '2.039', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '638.2', 'tokens/total': 15310848, 'tokens/trainable': 15157877, 'epoch': '2.097'}
 33%|██████████████████████████████████████████████████████████████▊                                                                                                                                | 1869/5680 [4:58:23<12:23:40, 11.71s/it] 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1870/5680 [4:58:35<12:21:10, 11.67s/it]                                                                                                                                                                                                                                             {'loss': '0.6103', 'grad_norm': '0.3029', 'learning_rate': '0.0001512', 'ppl': '1.841', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.2', 'tokens/total': 15319040, 'tokens/trainable': 15165732, 'epoch': '2.097'}
 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1870/5680 [4:58:35<12:21:10, 11.67s/it] 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1871/5680 [4:58:46<12:08:32, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.7252', 'grad_norm': '0.3001', 'learning_rate': '0.0001511', 'ppl': '2.065', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.1', 'tokens/total': 15327232, 'tokens/trainable': 15173614, 'epoch': '2.097'}
 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1871/5680 [4:58:46<12:08:32, 11.48s/it] 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1872/5680 [4:58:59<12:29:35, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.6162', 'grad_norm': '0.3003', 'learning_rate': '0.0001511', 'ppl': '1.852', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '628.6', 'tokens/total': 15335424, 'tokens/trainable': 15181526, 'epoch': '2.097'}
 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1872/5680 [4:58:59<12:29:35, 11.81s/it] 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1873/5680 [4:59:10<12:17:57, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.7547', 'grad_norm': '0.4333', 'learning_rate': '0.000151', 'ppl': '2.127', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '705.5', 'tokens/total': 15343616, 'tokens/trainable': 15189429, 'epoch': '2.098'}
 33%|██████████████████████████████████████████████████████████████▉                                                                                                                                | 1873/5680 [4:59:10<12:17:57, 11.63s/it] 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1874/5680 [4:59:21<12:16:08, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.541', 'grad_norm': '0.285', 'learning_rate': '0.000151', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '675.8', 'tokens/total': 15351808, 'tokens/trainable': 15197225, 'epoch': '2.098'}
 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1874/5680 [4:59:21<12:16:08, 11.61s/it] 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1875/5680 [4:59:34<12:31:48, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5327', 'grad_norm': '0.287', 'learning_rate': '0.0001509', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642.2', 'tokens/total': 15360000, 'tokens/trainable': 15205209, 'epoch': '2.098'}
 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1875/5680 [4:59:34<12:31:48, 11.86s/it] 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1876/5680 [4:59:44<12:08:10, 11.49s/it]                                                                                                                                                                                                                                             {'loss': '0.5306', 'grad_norm': '0.2504', 'learning_rate': '0.0001509', 'ppl': '1.7', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '747.9', 'tokens/total': 15368192, 'tokens/trainable': 15213148, 'epoch': '2.098'}
 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1876/5680 [4:59:44<12:08:10, 11.49s/it] 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1877/5680 [4:59:56<12:14:23, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6714', 'grad_norm': '0.2885', 'learning_rate': '0.0001508', 'ppl': '1.957', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '670', 'tokens/total': 15376384, 'tokens/trainable': 15221059, 'epoch': '2.098'}
 33%|███████████████████████████████████████████████████████████████                                                                                                                                | 1877/5680 [4:59:56<12:14:23, 11.59s/it] 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1878/5680 [5:00:09<12:32:40, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3001', 'grad_norm': '0.2118', 'learning_rate': '0.0001508', 'ppl': '1.35', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '637.2', 'tokens/total': 15384576, 'tokens/trainable': 15229057, 'epoch': '2.098'}
 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1878/5680 [5:00:09<12:32:40, 11.88s/it] 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1879/5680 [5:00:19<12:00:51, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.5938', 'grad_norm': '0.2417', 'learning_rate': '0.0001507', 'ppl': '1.811', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '777.4', 'tokens/total': 15392768, 'tokens/trainable': 15236997, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1879/5680 [5:00:19<12:00:51, 11.38s/it] 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1880/5680 [5:00:31<12:18:22, 11.66s/it]                                                                                                                                                                                                                                             {'loss': '0.5046', 'grad_norm': '0.2565', 'learning_rate': '0.0001507', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '629.8', 'tokens/total': 15400960, 'tokens/trainable': 15244742, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▏                                                                                                                               | 1880/5680 [5:00:31<12:18:22, 11.66s/it] 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1881/5680 [5:00:44<12:27:33, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.518', 'grad_norm': '0.3113', 'learning_rate': '0.0001506', 'ppl': '1.679', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '662.5', 'tokens/total': 15409152, 'tokens/trainable': 15252787, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1881/5680 [5:00:44<12:27:33, 11.81s/it] 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1882/5680 [5:00:54<12:01:01, 11.39s/it]                                                                                                                                                                                                                                             {'loss': '0.8612', 'grad_norm': '0.312', 'learning_rate': '0.0001506', 'ppl': '2.366', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '742.7', 'tokens/total': 15417344, 'tokens/trainable': 15260515, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1882/5680 [5:00:54<12:01:01, 11.39s/it] 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1883/5680 [5:01:06<12:20:41, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.534', 'grad_norm': '0.2577', 'learning_rate': '0.0001505', 'ppl': '1.706', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.6', 'tokens/total': 15425536, 'tokens/trainable': 15268429, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1883/5680 [5:01:06<12:20:41, 11.70s/it] 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1884/5680 [5:01:18<12:17:08, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7556', 'grad_norm': '0.2905', 'learning_rate': '0.0001505', 'ppl': '2.129', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '686.4', 'tokens/total': 15433728, 'tokens/trainable': 15276338, 'epoch': '2.099'}
 33%|███████████████████████████████████████████████████████████████▎                                                                                                                               | 1884/5680 [5:01:18<12:17:08, 11.65s/it] 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1885/5680 [5:01:29<12:04:45, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.5389', 'grad_norm': '0.2919', 'learning_rate': '0.0001504', 'ppl': '1.714', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '741.9', 'tokens/total': 15441920, 'tokens/trainable': 15284503, 'epoch': '2.1'}
 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1885/5680 [5:01:29<12:04:45, 11.46s/it] 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1886/5680 [5:01:41<12:23:49, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.4543', 'grad_norm': '0.256', 'learning_rate': '0.0001504', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '635.6', 'tokens/total': 15450112, 'tokens/trainable': 15292426, 'epoch': '2.1'}
 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1886/5680 [5:01:41<12:23:49, 11.76s/it] 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1887/5680 [5:01:53<12:12:41, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.5515', 'grad_norm': '0.2551', 'learning_rate': '0.0001504', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '720.1', 'tokens/total': 15458304, 'tokens/trainable': 15300470, 'epoch': '2.1'}
 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1887/5680 [5:01:53<12:12:41, 11.59s/it] 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1888/5680 [5:02:04<12:10:10, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.5379', 'grad_norm': '0.274', 'learning_rate': '0.0001503', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '706.3', 'tokens/total': 15466496, 'tokens/trainable': 15308564, 'epoch': '2.1'}
 33%|███████████████████████████████████████████████████████████████▍                                                                                                                               | 1888/5680 [5:02:04<12:10:10, 11.55s/it] 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1889/5680 [5:02:17<12:28:47, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6245', 'grad_norm': '0.3329', 'learning_rate': '0.0001503', 'ppl': '1.867', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.6', 'tokens/total': 15474688, 'tokens/trainable': 15316659, 'epoch': '2.1'}
 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1889/5680 [5:02:17<12:28:47, 11.85s/it] 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1890/5680 [5:02:27<12:09:22, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.5514', 'grad_norm': '0.2948', 'learning_rate': '0.0001502', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '743.8', 'tokens/total': 15482880, 'tokens/trainable': 15324713, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1890/5680 [5:02:27<12:09:22, 11.55s/it] 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1891/5680 [5:02:39<12:13:29, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.5406', 'grad_norm': '0.2608', 'learning_rate': '0.0001502', 'ppl': '1.717', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '682.7', 'tokens/total': 15491072, 'tokens/trainable': 15332744, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1891/5680 [5:02:39<12:13:29, 11.61s/it] 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1892/5680 [5:02:52<12:31:39, 11.91s/it]                                                                                                                                                                                                                                             {'loss': '0.854', 'grad_norm': '0.2914', 'learning_rate': '0.0001501', 'ppl': '2.349', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.4', 'tokens/total': 15499264, 'tokens/trainable': 15340848, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▌                                                                                                                               | 1892/5680 [5:02:52<12:31:39, 11.91s/it] 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1893/5680 [5:03:03<12:09:06, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.8582', 'grad_norm': '0.348', 'learning_rate': '0.0001501', 'ppl': '2.359', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '720.3', 'tokens/total': 15507456, 'tokens/trainable': 15348570, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1893/5680 [5:03:03<12:09:06, 11.55s/it] 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1894/5680 [5:03:14<12:16:02, 11.66s/it]                                                                                                                                                                                                                                             {'loss': '0.4801', 'grad_norm': '0.3261', 'learning_rate': '0.00015', 'ppl': '1.616', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '669.5', 'tokens/total': 15515648, 'tokens/trainable': 15356537, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1894/5680 [5:03:14<12:16:02, 11.66s/it] 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1895/5680 [5:03:27<12:32:33, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5865', 'grad_norm': '0.3028', 'learning_rate': '0.00015', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650', 'tokens/total': 15523840, 'tokens/trainable': 15364688, 'epoch': '2.101'}
 33%|███████████████████████████████████████████████████████████████▋                                                                                                                               | 1895/5680 [5:03:27<12:32:33, 11.93s/it] 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1896/5680 [5:03:37<11:57:50, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.6512', 'grad_norm': '0.2804', 'learning_rate': '0.0001499', 'ppl': '1.918', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '768.3', 'tokens/total': 15532032, 'tokens/trainable': 15372451, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1896/5680 [5:03:37<11:57:50, 11.38s/it] 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1897/5680 [5:03:49<12:15:03, 11.66s/it]                                                                                                                                                                                                                                             {'loss': '0.7736', 'grad_norm': '0.34', 'learning_rate': '0.0001499', 'ppl': '2.167', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '641.8', 'tokens/total': 15540224, 'tokens/trainable': 15380344, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1897/5680 [5:03:49<12:15:03, 11.66s/it] 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1898/5680 [5:04:02<12:26:30, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5675', 'grad_norm': '0.28', 'learning_rate': '0.0001498', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '666.2', 'tokens/total': 15548416, 'tokens/trainable': 15388516, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1898/5680 [5:04:02<12:26:30, 11.84s/it] 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1899/5680 [5:04:12<12:01:39, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.399', 'grad_norm': '0.2417', 'learning_rate': '0.0001498', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '748.1', 'tokens/total': 15556608, 'tokens/trainable': 15396399, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▊                                                                                                                               | 1899/5680 [5:04:12<12:01:39, 11.45s/it] 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1900/5680 [5:04:25<12:21:03, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.678', 'grad_norm': '0.3238', 'learning_rate': '0.0001497', 'ppl': '1.97', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.9', 'tokens/total': 15564800, 'tokens/trainable': 15404326, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1900/5680 [5:04:25<12:21:03, 11.76s/it] 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1901/5680 [5:04:37<12:22:27, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.7136', 'grad_norm': '0.2799', 'learning_rate': '0.0001497', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '680.1', 'tokens/total': 15572992, 'tokens/trainable': 15412378, 'epoch': '2.102'}
 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1901/5680 [5:04:37<12:22:27, 11.79s/it] 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1902/5680 [5:04:47<12:02:58, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.7794', 'grad_norm': '0.3208', 'learning_rate': '0.0001496', 'ppl': '2.18', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '758.7', 'tokens/total': 15581184, 'tokens/trainable': 15420538, 'epoch': '2.103'}
 33%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1902/5680 [5:04:47<12:02:58, 11.48s/it] 34%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1903/5680 [5:05:00<12:22:54, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.5043', 'grad_norm': '0.2539', 'learning_rate': '0.0001496', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.8', 'tokens/total': 15589376, 'tokens/trainable': 15428351, 'epoch': '2.103'}
 34%|███████████████████████████████████████████████████████████████▉                                                                                                                               | 1903/5680 [5:05:00<12:22:54, 11.80s/it] 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1904/5680 [5:05:11<12:17:25, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.5974', 'grad_norm': '0.2548', 'learning_rate': '0.0001495', 'ppl': '1.817', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678', 'tokens/total': 15597568, 'tokens/trainable': 15436160, 'epoch': '2.103'}
 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1904/5680 [5:05:11<12:17:25, 11.72s/it] 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1905/5680 [5:05:22<12:05:32, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.6999', 'grad_norm': '0.306', 'learning_rate': '0.0001495', 'ppl': '2.014', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '681', 'tokens/total': 15605760, 'tokens/trainable': 15443703, 'epoch': '2.103'}
 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1905/5680 [5:05:22<12:05:32, 11.53s/it] 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1906/5680 [5:05:35<12:23:45, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.4219', 'grad_norm': '0.2336', 'learning_rate': '0.0001494', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642.1', 'tokens/total': 15613952, 'tokens/trainable': 15451732, 'epoch': '2.103'}
 34%|████████████████████████████████████████████████████████████████                                                                                                                               | 1906/5680 [5:05:35<12:23:45, 11.82s/it] 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1907/5680 [5:05:46<12:09:38, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.384', 'grad_norm': '0.2352', 'learning_rate': '0.0001494', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '705.1', 'tokens/total': 15622144, 'tokens/trainable': 15459545, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1907/5680 [5:05:46<12:09:38, 11.60s/it] 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1908/5680 [5:05:57<12:05:10, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.5112', 'grad_norm': '0.3172', 'learning_rate': '0.0001493', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.9', 'tokens/total': 15630336, 'tokens/trainable': 15467525, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1908/5680 [5:05:57<12:05:10, 11.54s/it] 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1909/5680 [5:06:10<12:24:50, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7664', 'grad_norm': '0.3503', 'learning_rate': '0.0001493', 'ppl': '2.152', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '640', 'tokens/total': 15638528, 'tokens/trainable': 15475578, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1909/5680 [5:06:10<12:24:50, 11.85s/it] 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1910/5680 [5:06:21<12:01:47, 11.49s/it]                                                                                                                                                                                                                                             {'loss': '0.5636', 'grad_norm': '0.2981', 'learning_rate': '0.0001492', 'ppl': '1.757', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '714.4', 'tokens/total': 15646720, 'tokens/trainable': 15483174, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▏                                                                                                                              | 1910/5680 [5:06:21<12:01:47, 11.49s/it] 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1911/5680 [5:06:32<12:07:19, 11.58s/it]                                                                                                                                                                                                                                             {'loss': '0.6894', 'grad_norm': '0.2798', 'learning_rate': '0.0001492', 'ppl': '1.993', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '690.1', 'tokens/total': 15654912, 'tokens/trainable': 15491304, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1911/5680 [5:06:32<12:07:19, 11.58s/it] 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1912/5680 [5:06:45<12:25:43, 11.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6319', 'grad_norm': '0.3048', 'learning_rate': '0.0001492', 'ppl': '1.881', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.3', 'tokens/total': 15663104, 'tokens/trainable': 15499271, 'epoch': '2.104'}
 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1912/5680 [5:06:45<12:25:43, 11.87s/it] 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1913/5680 [5:06:55<11:55:31, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.6234', 'grad_norm': '0.3429', 'learning_rate': '0.0001491', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '760', 'tokens/total': 15671296, 'tokens/trainable': 15507084, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1913/5680 [5:06:55<11:55:31, 11.40s/it] 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1914/5680 [5:07:08<12:11:09, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.4418', 'grad_norm': '0.2342', 'learning_rate': '0.0001491', 'ppl': '1.555', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.4', 'tokens/total': 15679488, 'tokens/trainable': 15515031, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▎                                                                                                                              | 1914/5680 [5:07:08<12:11:09, 11.65s/it] 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1915/5680 [5:07:20<12:24:43, 11.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4755', 'grad_norm': '0.2465', 'learning_rate': '0.000149', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '660.1', 'tokens/total': 15687680, 'tokens/trainable': 15523200, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1915/5680 [5:07:20<12:24:43, 11.87s/it] 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1916/5680 [5:07:30<11:55:21, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.5097', 'grad_norm': '0.2419', 'learning_rate': '0.000149', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '777.6', 'tokens/total': 15695872, 'tokens/trainable': 15531214, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1916/5680 [5:07:30<11:55:21, 11.40s/it] 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1917/5680 [5:07:43<12:17:48, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.5746', 'grad_norm': '0.2576', 'learning_rate': '0.0001489', 'ppl': '1.776', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.1', 'tokens/total': 15704064, 'tokens/trainable': 15538826, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1917/5680 [5:07:43<12:17:48, 11.76s/it] 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1918/5680 [5:07:55<12:20:25, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.6768', 'grad_norm': '0.3074', 'learning_rate': '0.0001489', 'ppl': '1.968', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '681.9', 'tokens/total': 15712256, 'tokens/trainable': 15546947, 'epoch': '2.105'}
 34%|████████████████████████████████████████████████████████████████▍                                                                                                                              | 1918/5680 [5:07:55<12:20:25, 11.81s/it] 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1919/5680 [5:08:06<12:01:24, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5385', 'grad_norm': '0.2517', 'learning_rate': '0.0001488', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '726.5', 'tokens/total': 15720448, 'tokens/trainable': 15554782, 'epoch': '2.106'}
 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1919/5680 [5:08:06<12:01:24, 11.51s/it] 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1920/5680 [5:08:18<12:19:00, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.7751', 'grad_norm': '0.3283', 'learning_rate': '0.0001488', 'ppl': '2.171', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.3', 'tokens/total': 15728640, 'tokens/trainable': 15562842, 'epoch': '2.106'}
 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1920/5680 [5:08:18<12:19:00, 11.79s/it] 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1921/5680 [5:08:30<12:16:21, 11.75s/it]                                                                                                                                                                                                                                             {'loss': '0.8784', 'grad_norm': '0.356', 'learning_rate': '0.0001487', 'ppl': '2.407', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.3', 'tokens/total': 15736832, 'tokens/trainable': 15571016, 'epoch': '2.106'}
 34%|████████████████████████████████████████████████████████████████▌                                                                                                                              | 1921/5680 [5:08:30<12:16:21, 11.75s/it] 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1922/5680 [5:08:41<12:00:23, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.6949', 'grad_norm': '0.2694', 'learning_rate': '0.0001487', 'ppl': '2.003', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '729.1', 'tokens/total': 15745024, 'tokens/trainable': 15578971, 'epoch': '2.106'}
 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1922/5680 [5:08:41<12:00:23, 11.50s/it] 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1923/5680 [5:08:53<12:19:46, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.9499', 'grad_norm': '0.3063', 'learning_rate': '0.0001486', 'ppl': '2.585', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.8', 'tokens/total': 15753216, 'tokens/trainable': 15587057, 'epoch': '2.106'}
 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1923/5680 [5:08:53<12:19:46, 11.81s/it] 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1924/5680 [5:09:04<12:05:48, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6138', 'grad_norm': '0.2526', 'learning_rate': '0.0001486', 'ppl': '1.847', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '732.5', 'tokens/total': 15761408, 'tokens/trainable': 15595168, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1924/5680 [5:09:04<12:05:48, 11.59s/it] 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1925/5680 [5:09:16<12:03:55, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.9324', 'grad_norm': '0.3011', 'learning_rate': '0.0001485', 'ppl': '2.541', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '708.9', 'tokens/total': 15769600, 'tokens/trainable': 15603316, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▋                                                                                                                              | 1925/5680 [5:09:16<12:03:55, 11.57s/it] 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1926/5680 [5:09:28<12:19:35, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.6661', 'grad_norm': '0.2936', 'learning_rate': '0.0001485', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.4', 'tokens/total': 15777792, 'tokens/trainable': 15611311, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1926/5680 [5:09:28<12:19:35, 11.82s/it] 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1927/5680 [5:09:39<11:56:44, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.5806', 'grad_norm': '0.3115', 'learning_rate': '0.0001484', 'ppl': '1.787', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '736.7', 'tokens/total': 15785984, 'tokens/trainable': 15619129, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1927/5680 [5:09:39<11:56:44, 11.46s/it] 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1928/5680 [5:09:51<12:03:18, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.483', 'grad_norm': '0.2389', 'learning_rate': '0.0001484', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '677.5', 'tokens/total': 15794176, 'tokens/trainable': 15627128, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1928/5680 [5:09:51<12:03:18, 11.57s/it] 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1929/5680 [5:10:03<12:21:30, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.423', 'grad_norm': '0.2881', 'learning_rate': '0.0001483', 'ppl': '1.527', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '604.8', 'tokens/total': 15802368, 'tokens/trainable': 15634713, 'epoch': '2.107'}
 34%|████████████████████████████████████████████████████████████████▊                                                                                                                              | 1929/5680 [5:10:03<12:21:30, 11.86s/it] 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1930/5680 [5:10:13<11:52:41, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.7733', 'grad_norm': '0.2684', 'learning_rate': '0.0001483', 'ppl': '2.167', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '783', 'tokens/total': 15810560, 'tokens/trainable': 15642799, 'epoch': '2.108'}
 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1930/5680 [5:10:13<11:52:41, 11.40s/it] 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1931/5680 [5:10:26<12:07:43, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.4853', 'grad_norm': '0.2823', 'learning_rate': '0.0001482', 'ppl': '1.625', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '622.9', 'tokens/total': 15818752, 'tokens/trainable': 15650394, 'epoch': '2.108'}
 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1931/5680 [5:10:26<12:07:43, 11.65s/it] 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1932/5680 [5:10:38<12:19:15, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.816', 'grad_norm': '0.3565', 'learning_rate': '0.0001482', 'ppl': '2.261', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '663.1', 'tokens/total': 15826944, 'tokens/trainable': 15658528, 'epoch': '2.108'}
 34%|████████████████████████████████████████████████████████████████▉                                                                                                                              | 1932/5680 [5:10:38<12:19:15, 11.83s/it] 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1933/5680 [5:10:48<11:52:09, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.4857', 'grad_norm': '0.2809', 'learning_rate': '0.0001481', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '756.8', 'tokens/total': 15835136, 'tokens/trainable': 15666380, 'epoch': '2.108'}
 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1933/5680 [5:10:48<11:52:09, 11.40s/it] 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1934/5680 [5:11:01<12:10:12, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.5828', 'grad_norm': '0.2784', 'learning_rate': '0.0001481', 'ppl': '1.791', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.6', 'tokens/total': 15843328, 'tokens/trainable': 15674492, 'epoch': '2.108'}
 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1934/5680 [5:11:01<12:10:12, 11.70s/it] 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1935/5680 [5:11:13<12:11:41, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.8367', 'grad_norm': '0.2984', 'learning_rate': '0.000148', 'ppl': '2.309', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '664.1', 'tokens/total': 15851520, 'tokens/trainable': 15682314, 'epoch': '2.108'}
 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1935/5680 [5:11:13<12:11:41, 11.72s/it] 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1936/5680 [5:11:23<11:54:10, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.4753', 'grad_norm': '0.2471', 'learning_rate': '0.000148', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '755.7', 'tokens/total': 15859712, 'tokens/trainable': 15690468, 'epoch': '2.109'}
 34%|█████████████████████████████████████████████████████████████████                                                                                                                              | 1936/5680 [5:11:23<11:54:10, 11.45s/it] 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1937/5680 [5:11:36<12:13:20, 11.76s/it]                                                                                                                                                                                                                                             {'loss': '0.8541', 'grad_norm': '0.347', 'learning_rate': '0.0001479', 'ppl': '2.349', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.6', 'tokens/total': 15867904, 'tokens/trainable': 15698657, 'epoch': '2.109'}
 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1937/5680 [5:11:36<12:13:20, 11.76s/it] 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1938/5680 [5:11:47<12:06:29, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5779', 'grad_norm': '0.3307', 'learning_rate': '0.0001479', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '712.7', 'tokens/total': 15876096, 'tokens/trainable': 15706778, 'epoch': '2.109'}
 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1938/5680 [5:11:47<12:06:29, 11.65s/it] 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1939/5680 [5:11:58<11:59:16, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.5993', 'grad_norm': '0.3657', 'learning_rate': '0.0001478', 'ppl': '1.821', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721.3', 'tokens/total': 15884288, 'tokens/trainable': 15714908, 'epoch': '2.109'}
 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1939/5680 [5:11:58<11:59:16, 11.54s/it] 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1940/5680 [5:12:11<12:15:39, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.7583', 'grad_norm': '0.2724', 'learning_rate': '0.0001478', 'ppl': '2.135', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '634.9', 'tokens/total': 15892480, 'tokens/trainable': 15722791, 'epoch': '2.109'}
 34%|█████████████████████████████████████████████████████████████████▏                                                                                                                             | 1940/5680 [5:12:11<12:15:39, 11.80s/it] 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1941/5680 [5:12:22<12:00:55, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.5409', 'grad_norm': '0.2467', 'learning_rate': '0.0001477', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.5', 'tokens/total': 15900672, 'tokens/trainable': 15730673, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1941/5680 [5:12:22<12:00:55, 11.57s/it] 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1942/5680 [5:12:34<12:01:46, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.4701', 'grad_norm': '0.2338', 'learning_rate': '0.0001477', 'ppl': '1.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '679.4', 'tokens/total': 15908864, 'tokens/trainable': 15738555, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1942/5680 [5:12:34<12:01:46, 11.59s/it] 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1943/5680 [5:12:46<12:19:48, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6201', 'grad_norm': '0.2448', 'learning_rate': '0.0001477', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648.6', 'tokens/total': 15917056, 'tokens/trainable': 15746700, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1943/5680 [5:12:46<12:19:48, 11.88s/it] 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1944/5680 [5:12:57<12:01:05, 11.58s/it]                                                                                                                                                                                                                                             {'loss': '0.903', 'grad_norm': '0.307', 'learning_rate': '0.0001476', 'ppl': '2.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '727.7', 'tokens/total': 15925248, 'tokens/trainable': 15754617, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▎                                                                                                                             | 1944/5680 [5:12:57<12:01:05, 11.58s/it] 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1945/5680 [5:13:09<12:03:20, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.762', 'grad_norm': '0.3107', 'learning_rate': '0.0001476', 'ppl': '2.143', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '692', 'tokens/total': 15933440, 'tokens/trainable': 15762714, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1945/5680 [5:13:09<12:03:20, 11.62s/it] 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1946/5680 [5:13:21<12:20:27, 11.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4557', 'grad_norm': '0.2531', 'learning_rate': '0.0001475', 'ppl': '1.577', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '607.8', 'tokens/total': 15941632, 'tokens/trainable': 15770332, 'epoch': '2.11'}
 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1946/5680 [5:13:21<12:20:27, 11.90s/it] 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1947/5680 [5:13:32<11:55:13, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.673', 'grad_norm': '0.3024', 'learning_rate': '0.0001475', 'ppl': '1.96', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '733.6', 'tokens/total': 15949824, 'tokens/trainable': 15778069, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▍                                                                                                                             | 1947/5680 [5:13:32<11:55:13, 11.50s/it] 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1948/5680 [5:13:44<12:07:39, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.934', 'grad_norm': '0.3709', 'learning_rate': '0.0001474', 'ppl': '2.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '664', 'tokens/total': 15958016, 'tokens/trainable': 15786143, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1948/5680 [5:13:44<12:07:39, 11.70s/it] 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1949/5680 [5:13:57<12:23:59, 11.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3823', 'grad_norm': '0.2435', 'learning_rate': '0.0001474', 'ppl': '1.466', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '591.8', 'tokens/total': 15966208, 'tokens/trainable': 15793586, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1949/5680 [5:13:57<12:23:59, 11.96s/it] 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1950/5680 [5:14:07<11:49:39, 11.42s/it]                                                                                                                                                                                                                                             {'loss': '0.5231', 'grad_norm': '0.2612', 'learning_rate': '0.0001473', 'ppl': '1.687', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '775.2', 'tokens/total': 15974400, 'tokens/trainable': 15801431, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1950/5680 [5:14:07<11:49:39, 11.42s/it] 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1951/5680 [5:14:19<12:08:49, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.283', 'grad_norm': '0.2261', 'learning_rate': '0.0001473', 'ppl': '1.327', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '606.2', 'tokens/total': 15982592, 'tokens/trainable': 15808979, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▌                                                                                                                             | 1951/5680 [5:14:19<12:08:49, 11.73s/it] 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1952/5680 [5:14:31<12:14:25, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.8194', 'grad_norm': '0.3409', 'learning_rate': '0.0001472', 'ppl': '2.269', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '672.7', 'tokens/total': 15990784, 'tokens/trainable': 15817075, 'epoch': '2.111'}
 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1952/5680 [5:14:31<12:14:25, 11.82s/it] 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1953/5680 [5:14:42<11:50:25, 11.44s/it]                                                                                                                                                                                                                                             {'loss': '0.5316', 'grad_norm': '0.2792', 'learning_rate': '0.0001472', 'ppl': '1.702', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '719.1', 'tokens/total': 15998976, 'tokens/trainable': 15824647, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1953/5680 [5:14:42<11:50:25, 11.44s/it] 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1954/5680 [5:14:54<12:12:22, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.8956', 'grad_norm': '0.3963', 'learning_rate': '0.0001471', 'ppl': '2.449', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '627.4', 'tokens/total': 16007168, 'tokens/trainable': 15832565, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1954/5680 [5:14:54<12:12:22, 11.79s/it] 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1955/5680 [5:15:06<12:12:36, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.5124', 'grad_norm': '0.2909', 'learning_rate': '0.0001471', 'ppl': '1.669', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '663.5', 'tokens/total': 16015360, 'tokens/trainable': 15840404, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                             | 1955/5680 [5:15:06<12:12:36, 11.80s/it] 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1956/5680 [5:15:17<11:53:57, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.9362', 'grad_norm': '0.3905', 'learning_rate': '0.000147', 'ppl': '2.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '751.4', 'tokens/total': 16023552, 'tokens/trainable': 15848516, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1956/5680 [5:15:17<11:53:57, 11.50s/it] 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1957/5680 [5:15:29<12:12:03, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.6709', 'grad_norm': '0.2847', 'learning_rate': '0.000147', 'ppl': '1.956', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '609.6', 'tokens/total': 16031744, 'tokens/trainable': 15856126, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1957/5680 [5:15:29<12:12:03, 11.80s/it] 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1958/5680 [5:15:41<12:06:17, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.504', 'grad_norm': '0.2788', 'learning_rate': '0.0001469', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.5', 'tokens/total': 16039936, 'tokens/trainable': 15863923, 'epoch': '2.112'}
 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1958/5680 [5:15:41<12:06:17, 11.71s/it] 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1959/5680 [5:15:52<11:56:24, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6668', 'grad_norm': '0.3111', 'learning_rate': '0.0001469', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '702.7', 'tokens/total': 16048128, 'tokens/trainable': 15871774, 'epoch': '2.113'}
 34%|█████████████████████████████████████████████████████████████████▊                                                                                                                             | 1959/5680 [5:15:52<11:56:24, 11.55s/it] 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1960/5680 [5:16:05<12:19:28, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.9547', 'grad_norm': '0.3366', 'learning_rate': '0.0001468', 'ppl': '2.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '589', 'tokens/total': 16056320, 'tokens/trainable': 15879313, 'epoch': '2.113'}
 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1960/5680 [5:16:05<12:19:28, 11.93s/it] 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1961/5680 [5:16:16<12:07:05, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.5473', 'grad_norm': '0.266', 'learning_rate': '0.0001468', 'ppl': '1.729', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.6', 'tokens/total': 16064512, 'tokens/trainable': 15887384, 'epoch': '2.113'}
 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1961/5680 [5:16:16<12:07:05, 11.73s/it] 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1962/5680 [5:16:28<11:58:50, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.4329', 'grad_norm': '0.2514', 'learning_rate': '0.0001467', 'ppl': '1.542', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '690.2', 'tokens/total': 16072704, 'tokens/trainable': 15895180, 'epoch': '2.113'}
 35%|█████████████████████████████████████████████████████████████████▉                                                                                                                             | 1962/5680 [5:16:28<11:58:50, 11.60s/it] 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1963/5680 [5:16:40<12:14:49, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4457', 'grad_norm': '0.2566', 'learning_rate': '0.0001467', 'ppl': '1.562', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '616.7', 'tokens/total': 16080896, 'tokens/trainable': 15902866, 'epoch': '2.113'}
 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1963/5680 [5:16:40<12:14:49, 11.86s/it] 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1964/5680 [5:16:51<11:54:50, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.6339', 'grad_norm': '0.3192', 'learning_rate': '0.0001466', 'ppl': '1.885', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '740.9', 'tokens/total': 16089088, 'tokens/trainable': 15910861, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1964/5680 [5:16:51<11:54:50, 11.54s/it] 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1965/5680 [5:17:02<11:56:13, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.7986', 'grad_norm': '0.276', 'learning_rate': '0.0001466', 'ppl': '2.222', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '686.6', 'tokens/total': 16097280, 'tokens/trainable': 15918836, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1965/5680 [5:17:02<11:56:13, 11.57s/it] 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1966/5680 [5:17:15<12:13:25, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4136', 'grad_norm': '0.227', 'learning_rate': '0.0001465', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '624.5', 'tokens/total': 16105472, 'tokens/trainable': 15926642, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████                                                                                                                             | 1966/5680 [5:17:15<12:13:25, 11.85s/it] 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1967/5680 [5:17:25<11:45:30, 11.40s/it]                                                                                                                                                                                                                                             {'loss': '0.5254', 'grad_norm': '0.2906', 'learning_rate': '0.0001465', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '752.4', 'tokens/total': 16113664, 'tokens/trainable': 15934429, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1967/5680 [5:17:25<11:45:30, 11.40s/it] 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1968/5680 [5:17:37<12:00:08, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.9845', 'grad_norm': '0.35', 'learning_rate': '0.0001464', 'ppl': '2.676', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '643.2', 'tokens/total': 16121856, 'tokens/trainable': 15942268, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1968/5680 [5:17:37<12:00:08, 11.64s/it] 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1969/5680 [5:17:50<12:14:45, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5192', 'grad_norm': '0.3098', 'learning_rate': '0.0001464', 'ppl': '1.681', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '613', 'tokens/total': 16130048, 'tokens/trainable': 15949884, 'epoch': '2.114'}
 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1969/5680 [5:17:50<12:14:45, 11.88s/it] 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1970/5680 [5:18:00<11:43:04, 11.37s/it]                                                                                                                                                                                                                                             {'loss': '0.6842', 'grad_norm': '0.3006', 'learning_rate': '0.0001463', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '788.6', 'tokens/total': 16138240, 'tokens/trainable': 15957899, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▏                                                                                                                            | 1970/5680 [5:18:00<11:43:04, 11.37s/it] 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1971/5680 [5:18:13<12:02:18, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.7921', 'grad_norm': '0.3515', 'learning_rate': '0.0001463', 'ppl': '2.208', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '616.3', 'tokens/total': 16146432, 'tokens/trainable': 15965549, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1971/5680 [5:18:13<12:02:18, 11.68s/it] 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1972/5680 [5:18:25<12:08:19, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '1.103', 'grad_norm': '0.3666', 'learning_rate': '0.0001462', 'ppl': '3.013', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '676.4', 'tokens/total': 16154624, 'tokens/trainable': 15973672, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1972/5680 [5:18:25<12:08:19, 11.79s/it] 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1973/5680 [5:18:35<11:47:12, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.5576', 'grad_norm': '0.2799', 'learning_rate': '0.0001462', 'ppl': '1.746', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '698', 'tokens/total': 16162816, 'tokens/trainable': 15981109, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▎                                                                                                                            | 1973/5680 [5:18:35<11:47:12, 11.45s/it] 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1974/5680 [5:18:48<12:05:18, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.7034', 'grad_norm': '0.3019', 'learning_rate': '0.0001461', 'ppl': '2.021', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '643.5', 'tokens/total': 16171008, 'tokens/trainable': 15989109, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1974/5680 [5:18:48<12:05:18, 11.74s/it] 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1975/5680 [5:18:59<11:59:10, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5559', 'grad_norm': '0.3737', 'learning_rate': '0.0001461', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.5', 'tokens/total': 16179200, 'tokens/trainable': 15996478, 'epoch': '2.115'}
 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1975/5680 [5:18:59<11:59:10, 11.65s/it] 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1976/5680 [5:19:10<11:47:51, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.7846', 'grad_norm': '0.3722', 'learning_rate': '0.000146', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '694.7', 'tokens/total': 16187392, 'tokens/trainable': 16004144, 'epoch': '2.116'}
 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1976/5680 [5:19:10<11:47:51, 11.47s/it] 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1977/5680 [5:19:23<12:07:33, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.6683', 'grad_norm': '0.2758', 'learning_rate': '0.000146', 'ppl': '1.951', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '630.9', 'tokens/total': 16195584, 'tokens/trainable': 16012052, 'epoch': '2.116'}
 35%|██████████████████████████████████████████████████████████████████▍                                                                                                                            | 1977/5680 [5:19:23<12:07:33, 11.79s/it] 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1978/5680 [5:19:34<11:54:40, 11.58s/it]                                                                                                                                                                                                                                             {'loss': '0.7141', 'grad_norm': '0.3042', 'learning_rate': '0.0001459', 'ppl': '2.042', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '694.2', 'tokens/total': 16203776, 'tokens/trainable': 16019756, 'epoch': '2.116'}
 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1978/5680 [5:19:34<11:54:40, 11.58s/it] 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1979/5680 [5:19:45<11:54:39, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.4394', 'grad_norm': '0.2492', 'learning_rate': '0.0001459', 'ppl': '1.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '636.1', 'tokens/total': 16211968, 'tokens/trainable': 16027036, 'epoch': '2.116'}
 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1979/5680 [5:19:45<11:54:39, 11.59s/it][2026-01-27 03:08:59,559] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:59846] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 03:09:00,886] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:59846] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:05<04:45, 19.53 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:05<02:08, 42.49 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:06<01:19, 66.99 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:06<00:56, 93.25 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:07<00:41, 123.20 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:07<00:35, 140.51 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:08<00:30, 159.48 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:08<00:30, 156.26 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:09<00:22, 209.41 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:09<00:21, 211.84 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:09<00:21, 214.80 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:10<00:19, 222.11 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:10<00:20, 213.06 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:11<00:19, 217.98 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:11<00:19, 208.21 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:12<00:18, 213.80 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:12<00:17, 222.12 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:13<00:16, 227.46 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:13<00:16, 223.53 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:14<00:16, 212.35 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:14<00:15, 222.34 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:15<00:16, 208.05 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:15<00:13, 248.94 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:16<00:13, 235.48 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:16<00:13, 228.75 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:17<00:13, 225.46 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:17<00:13, 214.38 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:18<00:12, 226.32 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:18<00:11, 227.57 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:18<00:11, 219.07 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:19<00:10, 221.65 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:19<00:10, 229.37 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:20<00:09, 227.31 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:20<00:09, 230.76 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:21<00:08, 225.87 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:21<00:08, 227.36 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:22<00:07, 237.23 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:22<00:06, 246.37 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:23<00:07, 217.45 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:23<00:06, 219.97 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:23<00:05, 255.76 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:24<00:05, 234.87 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:24<00:04, 253.72 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:25<00:04, 232.32 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:25<00:04, 215.85 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:26<00:03, 258.05 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:26<00:03, 232.26 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:26<00:02, 246.44 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:27<00:02, 242.06 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:27<00:01, 230.65 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:28<00:01, 246.44 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:28<00:00, 240.54 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:29<00:00, 239.91 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:29<00:00, 266.10 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:30<00:00, 186.85 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 912.13 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1304.15 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1506.53 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1649.50 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1690.89 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1754.92 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1569.11 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1433.72 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2070.55 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2407.26 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2648.05 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2738.33 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2432.50 examples/s]
[2026-01-27 03:09:37,546] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:59846] Using single process for pack_parallel, running sequentially.
[2026-01-27 03:09:42,794] [WARNING] [py.warnings._showwarnmsg:109] [PID:59846] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1980/5680 [5:20:40<25:14:08, 24.55s/it]                                                                                                                                                                                                                                             {'loss': '0.8129', 'grad_norm': '0.2922', 'learning_rate': '0.0001458', 'ppl': '2.254', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.1', 'tokens/total': 16220160, 'tokens/trainable': 16034371, 'epoch': '3'}
 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1980/5680 [5:20:40<25:14:08, 24.55s/it][2026-01-27 03:09:54,309] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:60085] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 03:09:55,412] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:60085] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<03:31, 26.37 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:39, 55.14 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:04<01:00, 88.63 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:42, 123.93 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:32, 157.23 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:26, 189.31 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:22, 216.43 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:19, 244.89 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:06<00:17, 265.24 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:16, 278.80 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:15, 288.68 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:07<00:14, 296.32 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:14, 302.25 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:13, 304.76 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:08<00:12, 319.28 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:12, 316.41 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:09<00:12, 315.26 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:12, 314.51 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:11, 320.40 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:10<00:11, 313.95 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:10<00:10, 318.22 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:10, 318.49 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:11<00:10, 312.46 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:11<00:09, 318.59 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:09, 318.27 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:12<00:09, 318.73 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:12<00:08, 321.33 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:08, 318.63 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:13<00:08, 324.89 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:13<00:07, 320.48 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:14<00:07, 318.54 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:14<00:07, 328.27 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:14<00:07, 314.28 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:14<00:06, 331.77 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:15<00:06, 331.68 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:15<00:05, 334.24 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:15<00:05, 329.33 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:16<00:05, 324.47 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:16<00:04, 325.90 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:16<00:04, 326.73 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:17<00:04, 294.53 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:17<00:03, 335.73 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:17<00:03, 337.87 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:18<00:03, 329.09 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:18<00:02, 329.37 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:18<00:02, 330.65 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:19<00:02, 319.12 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:19<00:01, 332.48 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:19<00:01, 322.01 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:20<00:01, 342.68 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:20<00:00, 337.59 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:20<00:00, 327.99 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:20<00:00, 360.67 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:21<00:00, 370.05 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:21<00:00, 260.10 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 924.16 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1313.63 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1552.81 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1676.90 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1734.05 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1706.00 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1567.59 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1293.14 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1955.56 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2361.67 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2658.56 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2762.01 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2391.84 examples/s]
[2026-01-27 03:10:28,836] [WARNING] [py.warnings._showwarnmsg:109] [PID:60085] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1981/5680 [5:21:26<31:49:15, 30.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6448', 'grad_norm': '0.2727', 'learning_rate': '0.0001458', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '177.2', 'tokens/total': 16228352, 'tokens/trainable': 16042511, 'epoch': '3'}
 35%|██████████████████████████████████████████████████████████████████▌                                                                                                                            | 1981/5680 [5:21:26<31:49:15, 30.97s/it] 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1982/5680 [5:21:37<25:46:23, 25.09s/it]                                                                                                                                                                                                                                             {'loss': '0.6933', 'grad_norm': '0.2733', 'learning_rate': '0.0001457', 'ppl': '2', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '722', 'tokens/total': 16236544, 'tokens/trainable': 16050687, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1982/5680 [5:21:37<25:46:23, 25.09s/it] 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1983/5680 [5:21:50<21:53:59, 21.33s/it]                                                                                                                                                                                                                                             {'loss': '0.4771', 'grad_norm': '0.3063', 'learning_rate': '0.0001457', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.3', 'tokens/total': 16244736, 'tokens/trainable': 16058843, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1983/5680 [5:21:50<21:53:59, 21.33s/it] 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1984/5680 [5:22:01<18:41:25, 18.20s/it]                                                                                                                                                                                                                                             {'loss': '0.6004', 'grad_norm': '0.2679', 'learning_rate': '0.0001456', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '747', 'tokens/total': 16252928, 'tokens/trainable': 16066997, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1984/5680 [5:22:01<18:41:25, 18.20s/it] 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1985/5680 [5:22:13<16:40:27, 16.25s/it]                                                                                                                                                                                                                                             {'loss': '0.6085', 'grad_norm': '0.2563', 'learning_rate': '0.0001456', 'ppl': '1.838', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.4', 'tokens/total': 16261120, 'tokens/trainable': 16075176, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▋                                                                                                                            | 1985/5680 [5:22:13<16:40:27, 16.25s/it] 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1986/5680 [5:22:25<15:32:18, 15.14s/it]                                                                                                                                                                                                                                             {'loss': '0.4527', 'grad_norm': '0.2713', 'learning_rate': '0.0001455', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.7', 'tokens/total': 16269312, 'tokens/trainable': 16083360, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1986/5680 [5:22:25<15:32:18, 15.14s/it] 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1987/5680 [5:22:36<14:11:37, 13.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5256', 'grad_norm': '0.259', 'learning_rate': '0.0001455', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '755.1', 'tokens/total': 16277504, 'tokens/trainable': 16091500, 'epoch': '3.001'}
 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1987/5680 [5:22:36<14:11:37, 13.84s/it] 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1988/5680 [5:22:48<13:34:19, 13.23s/it]                                                                                                                                                                                                                                             {'loss': '0.5132', 'grad_norm': '0.2852', 'learning_rate': '0.0001454', 'ppl': '1.671', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '692.6', 'tokens/total': 16285696, 'tokens/trainable': 16099682, 'epoch': '3.002'}
 35%|██████████████████████████████████████████████████████████████████▊                                                                                                                            | 1988/5680 [5:22:48<13:34:19, 13.23s/it] 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1989/5680 [5:23:00<13:23:06, 13.06s/it]                                                                                                                                                                                                                                             {'loss': '0.6643', 'grad_norm': '0.3438', 'learning_rate': '0.0001454', 'ppl': '1.943', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.9', 'tokens/total': 16293888, 'tokens/trainable': 16107843, 'epoch': '3.002'}
 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1989/5680 [5:23:00<13:23:06, 13.06s/it] 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1990/5680 [5:23:11<12:36:07, 12.29s/it]                                                                                                                                                                                                                                             {'loss': '0.4302', 'grad_norm': '0.262', 'learning_rate': '0.0001453', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '773.4', 'tokens/total': 16302080, 'tokens/trainable': 16115976, 'epoch': '3.002'}
 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1990/5680 [5:23:11<12:36:07, 12.29s/it] 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1991/5680 [5:23:23<12:32:31, 12.24s/it]                                                                                                                                                                                                                                             {'loss': '0.612', 'grad_norm': '0.2523', 'learning_rate': '0.0001453', 'ppl': '1.844', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '674.9', 'tokens/total': 16310272, 'tokens/trainable': 16124141, 'epoch': '3.002'}
 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1991/5680 [5:23:23<12:32:31, 12.24s/it] 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1992/5680 [5:23:35<12:35:02, 12.28s/it]                                                                                                                                                                                                                                             {'loss': '0.7401', 'grad_norm': '0.3385', 'learning_rate': '0.0001453', 'ppl': '2.096', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '660.6', 'tokens/total': 16318464, 'tokens/trainable': 16132320, 'epoch': '3.002'}
 35%|██████████████████████████████████████████████████████████████████▉                                                                                                                            | 1992/5680 [5:23:35<12:35:02, 12.28s/it] 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1993/5680 [5:23:46<11:55:58, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5051', 'grad_norm': '0.2661', 'learning_rate': '0.0001452', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '805.8', 'tokens/total': 16326656, 'tokens/trainable': 16140500, 'epoch': '3.002'}
 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1993/5680 [5:23:46<11:55:58, 11.65s/it] 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1994/5680 [5:23:58<12:10:21, 11.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8438', 'grad_norm': '0.306', 'learning_rate': '0.0001452', 'ppl': '2.325', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.2', 'tokens/total': 16334848, 'tokens/trainable': 16148626, 'epoch': '3.003'}
 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1994/5680 [5:23:58<12:10:21, 11.89s/it] 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1995/5680 [5:24:10<12:11:18, 11.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4773', 'grad_norm': '0.2271', 'learning_rate': '0.0001451', 'ppl': '1.612', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '679.5', 'tokens/total': 16343040, 'tokens/trainable': 16156742, 'epoch': '3.003'}
 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1995/5680 [5:24:10<12:11:18, 11.91s/it] 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1996/5680 [5:24:21<11:48:07, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.9717', 'grad_norm': '0.3667', 'learning_rate': '0.0001451', 'ppl': '2.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '766.9', 'tokens/total': 16351232, 'tokens/trainable': 16164908, 'epoch': '3.003'}
 35%|███████████████████████████████████████████████████████████████████                                                                                                                            | 1996/5680 [5:24:21<11:48:07, 11.53s/it] 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1997/5680 [5:24:33<12:05:24, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.5114', 'grad_norm': '0.2765', 'learning_rate': '0.000145', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656', 'tokens/total': 16359424, 'tokens/trainable': 16173093, 'epoch': '3.003'}
 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1997/5680 [5:24:33<12:05:24, 11.82s/it] 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1998/5680 [5:24:45<11:58:16, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.511', 'grad_norm': '0.2634', 'learning_rate': '0.000145', 'ppl': '1.667', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.6', 'tokens/total': 16367616, 'tokens/trainable': 16181277, 'epoch': '3.003'}
 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1998/5680 [5:24:45<11:58:16, 11.70s/it] 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1999/5680 [5:24:56<11:46:03, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.6235', 'grad_norm': '0.2652', 'learning_rate': '0.0001449', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '739.4', 'tokens/total': 16375808, 'tokens/trainable': 16189439, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▏                                                                                                                           | 1999/5680 [5:24:56<11:46:03, 11.51s/it] 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2000/5680 [5:25:08<12:04:42, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.5375', 'grad_norm': '0.2766', 'learning_rate': '0.0001449', 'ppl': '1.712', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.8', 'tokens/total': 16384000, 'tokens/trainable': 16197589, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2000/5680 [5:25:08<12:04:42, 11.82s/it][2026-01-27 03:14:22,237] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 03:15:13,294] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-2000
[2026-01-27 03:16:14,004] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 03:16:14,004] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2001/5680 [5:27:13<46:37:18, 45.62s/it]                                                                                                                                                                                                                                             {'loss': '0.5746', 'grad_norm': '0.2671', 'learning_rate': '0.0001448', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '689.6', 'tokens/total': 16392192, 'tokens/trainable': 16205766, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2001/5680 [5:27:13<46:37:18, 45.62s/it] 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2002/5680 [5:27:25<36:26:28, 35.67s/it]                                                                                                                                                                                                                                             {'loss': '0.4929', 'grad_norm': '0.2645', 'learning_rate': '0.0001448', 'ppl': '1.637', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.9', 'tokens/total': 16400384, 'tokens/trainable': 16213900, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2002/5680 [5:27:25<36:26:28, 35.67s/it] 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2003/5680 [5:27:37<29:08:48, 28.54s/it]                                                                                                                                                                                                                                             {'loss': '0.5665', 'grad_norm': '0.2703', 'learning_rate': '0.0001447', 'ppl': '1.762', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '685.7', 'tokens/total': 16408576, 'tokens/trainable': 16222055, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▎                                                                                                                           | 2003/5680 [5:27:37<29:08:48, 28.54s/it] 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2004/5680 [5:27:48<23:43:41, 23.24s/it]                                                                                                                                                                                                                                             {'loss': '0.5233', 'grad_norm': '0.2803', 'learning_rate': '0.0001447', 'ppl': '1.688', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '748.2', 'tokens/total': 16416768, 'tokens/trainable': 16230181, 'epoch': '3.004'}
 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2004/5680 [5:27:48<23:43:41, 23.24s/it] 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2005/5680 [5:28:00<20:24:54, 20.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4389', 'grad_norm': '0.2261', 'learning_rate': '0.0001446', 'ppl': '1.551', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '657.2', 'tokens/total': 16424960, 'tokens/trainable': 16238349, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2005/5680 [5:28:00<20:24:54, 20.00s/it] 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2006/5680 [5:28:12<17:46:49, 17.42s/it]                                                                                                                                                                                                                                             {'loss': '0.4339', 'grad_norm': '0.2667', 'learning_rate': '0.0001446', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '717.8', 'tokens/total': 16433152, 'tokens/trainable': 16246539, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2006/5680 [5:28:12<17:46:49, 17.42s/it] 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2007/5680 [5:28:23<15:51:48, 15.55s/it]                                                                                                                                                                                                                                             {'loss': '0.6588', 'grad_norm': '0.3571', 'learning_rate': '0.0001445', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '730.6', 'tokens/total': 16441344, 'tokens/trainable': 16254695, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▍                                                                                                                           | 2007/5680 [5:28:23<15:51:48, 15.55s/it] 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2008/5680 [5:28:35<14:56:03, 14.64s/it]                                                                                                                                                                                                                                             {'loss': '0.5254', 'grad_norm': '0.2754', 'learning_rate': '0.0001445', 'ppl': '1.691', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.1', 'tokens/total': 16449536, 'tokens/trainable': 16262867, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2008/5680 [5:28:35<14:56:03, 14.64s/it] 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2009/5680 [5:28:47<13:50:46, 13.58s/it]                                                                                                                                                                                                                                             {'loss': '0.5706', 'grad_norm': '0.3412', 'learning_rate': '0.0001444', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '733', 'tokens/total': 16457728, 'tokens/trainable': 16270996, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2009/5680 [5:28:47<13:50:46, 13.58s/it] 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2010/5680 [5:28:58<13:11:15, 12.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4843', 'grad_norm': '0.2524', 'learning_rate': '0.0001444', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.3', 'tokens/total': 16465920, 'tokens/trainable': 16279172, 'epoch': '3.005'}
 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2010/5680 [5:28:58<13:11:15, 12.94s/it] 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2011/5680 [5:29:11<13:04:35, 12.83s/it]                                                                                                                                                                                                                                             {'loss': '0.747', 'grad_norm': '0.2969', 'learning_rate': '0.0001443', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.3', 'tokens/total': 16474112, 'tokens/trainable': 16287304, 'epoch': '3.006'}
 35%|███████████████████████████████████████████████████████████████████▌                                                                                                                           | 2011/5680 [5:29:11<13:04:35, 12.83s/it] 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2012/5680 [5:29:22<12:38:37, 12.41s/it]                                                                                                                                                                                                                                             {'loss': '0.4646', 'grad_norm': '0.2968', 'learning_rate': '0.0001443', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.9', 'tokens/total': 16482304, 'tokens/trainable': 16295491, 'epoch': '3.006'}
 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2012/5680 [5:29:22<12:38:37, 12.41s/it] 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2013/5680 [5:29:33<12:20:19, 12.11s/it]                                                                                                                                                                                                                                             {'loss': '0.6837', 'grad_norm': '0.3745', 'learning_rate': '0.0001442', 'ppl': '1.981', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '715.7', 'tokens/total': 16490496, 'tokens/trainable': 16303655, 'epoch': '3.006'}
 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2013/5680 [5:29:33<12:20:19, 12.11s/it] 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2014/5680 [5:29:46<12:28:54, 12.26s/it]                                                                                                                                                                                                                                             {'loss': '0.6424', 'grad_norm': '0.3235', 'learning_rate': '0.0001442', 'ppl': '1.901', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.9', 'tokens/total': 16498688, 'tokens/trainable': 16311809, 'epoch': '3.006'}
 35%|███████████████████████████████████████████████████████████████████▋                                                                                                                           | 2014/5680 [5:29:46<12:28:54, 12.26s/it] 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2015/5680 [5:29:57<12:00:55, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.4026', 'grad_norm': '0.2617', 'learning_rate': '0.0001441', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '759.3', 'tokens/total': 16506880, 'tokens/trainable': 16319963, 'epoch': '3.006'}
 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2015/5680 [5:29:57<12:00:55, 11.80s/it] 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2016/5680 [5:30:09<12:02:34, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5129', 'grad_norm': '0.2949', 'learning_rate': '0.0001441', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '688.7', 'tokens/total': 16515072, 'tokens/trainable': 16328154, 'epoch': '3.007'}
 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2016/5680 [5:30:09<12:02:34, 11.83s/it] 36%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2017/5680 [5:30:21<12:16:41, 12.07s/it]                                                                                                                                                                                                                                             {'loss': '0.7069', 'grad_norm': '0.3084', 'learning_rate': '0.000144', 'ppl': '2.028', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647', 'tokens/total': 16523264, 'tokens/trainable': 16336311, 'epoch': '3.007'}
 36%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2017/5680 [5:30:21<12:16:41, 12.07s/it] 36%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2018/5680 [5:30:32<11:47:32, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.7846', 'grad_norm': '0.3039', 'learning_rate': '0.000144', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '780.5', 'tokens/total': 16531456, 'tokens/trainable': 16344493, 'epoch': '3.007'}
 36%|███████████████████████████████████████████████████████████████████▊                                                                                                                           | 2018/5680 [5:30:32<11:47:32, 11.59s/it] 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2019/5680 [5:30:44<11:56:14, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.5334', 'grad_norm': '0.3334', 'learning_rate': '0.0001439', 'ppl': '1.705', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '675.5', 'tokens/total': 16539648, 'tokens/trainable': 16352644, 'epoch': '3.007'}
 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2019/5680 [5:30:44<11:56:14, 11.74s/it] 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2020/5680 [5:30:56<12:12:19, 12.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3795', 'grad_norm': '0.2314', 'learning_rate': '0.0001439', 'ppl': '1.462', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.9', 'tokens/total': 16547840, 'tokens/trainable': 16360795, 'epoch': '3.007'}
 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2020/5680 [5:30:56<12:12:19, 12.01s/it] 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2021/5680 [5:31:07<11:36:29, 11.42s/it]                                                                                                                                                                                                                                             {'loss': '0.5666', 'grad_norm': '0.2875', 'learning_rate': '0.0001438', 'ppl': '1.762', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '806.4', 'tokens/total': 16556032, 'tokens/trainable': 16368903, 'epoch': '3.007'}
 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2021/5680 [5:31:07<11:36:29, 11.42s/it] 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2022/5680 [5:31:19<11:55:36, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.5889', 'grad_norm': '0.2716', 'learning_rate': '0.0001438', 'ppl': '1.802', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655', 'tokens/total': 16564224, 'tokens/trainable': 16377074, 'epoch': '3.008'}
 36%|███████████████████████████████████████████████████████████████████▉                                                                                                                           | 2022/5680 [5:31:19<11:55:36, 11.74s/it] 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2023/5680 [5:31:31<12:02:08, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5691', 'grad_norm': '0.3156', 'learning_rate': '0.0001437', 'ppl': '1.767', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '676.6', 'tokens/total': 16572416, 'tokens/trainable': 16385261, 'epoch': '3.008'}
 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2023/5680 [5:31:31<12:02:08, 11.85s/it] 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2024/5680 [5:31:42<11:37:35, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.6117', 'grad_norm': '0.3007', 'learning_rate': '0.0001437', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '773.5', 'tokens/total': 16580608, 'tokens/trainable': 16393386, 'epoch': '3.008'}
 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2024/5680 [5:31:42<11:37:35, 11.45s/it] 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2025/5680 [5:31:54<11:55:19, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.3369', 'grad_norm': '0.2563', 'learning_rate': '0.0001436', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '654', 'tokens/total': 16588800, 'tokens/trainable': 16401510, 'epoch': '3.008'}
 36%|████████████████████████████████████████████████████████████████████                                                                                                                           | 2025/5680 [5:31:54<11:55:19, 11.74s/it] 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2026/5680 [5:32:06<11:51:32, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.6434', 'grad_norm': '0.3026', 'learning_rate': '0.0001436', 'ppl': '1.903', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '705.9', 'tokens/total': 16596992, 'tokens/trainable': 16409656, 'epoch': '3.008'}
 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2026/5680 [5:32:06<11:51:32, 11.68s/it] 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2027/5680 [5:32:17<11:41:03, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5807', 'grad_norm': '0.3989', 'learning_rate': '0.0001435', 'ppl': '1.787', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '732.7', 'tokens/total': 16605184, 'tokens/trainable': 16417796, 'epoch': '3.008'}
 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2027/5680 [5:32:17<11:41:03, 11.51s/it] 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2028/5680 [5:32:29<11:58:14, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.7459', 'grad_norm': '0.284', 'learning_rate': '0.0001435', 'ppl': '2.108', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.8', 'tokens/total': 16613376, 'tokens/trainable': 16425968, 'epoch': '3.009'}
 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2028/5680 [5:32:29<11:58:14, 11.80s/it] 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2029/5680 [5:32:40<11:47:43, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.3432', 'grad_norm': '0.2989', 'learning_rate': '0.0001434', 'ppl': '1.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '725.6', 'tokens/total': 16621568, 'tokens/trainable': 16434116, 'epoch': '3.009'}
 36%|████████████████████████████████████████████████████████████████████▏                                                                                                                          | 2029/5680 [5:32:40<11:47:43, 11.63s/it] 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2030/5680 [5:32:52<11:43:00, 11.56s/it]                                                                                                                                                                                                                                             {'loss': '0.659', 'grad_norm': '0.3216', 'learning_rate': '0.0001434', 'ppl': '1.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '716.9', 'tokens/total': 16629760, 'tokens/trainable': 16442268, 'epoch': '3.009'}
 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2030/5680 [5:32:52<11:43:00, 11.56s/it] 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2031/5680 [5:33:04<12:00:33, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6525', 'grad_norm': '0.3037', 'learning_rate': '0.0001433', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649', 'tokens/total': 16637952, 'tokens/trainable': 16450395, 'epoch': '3.009'}
 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2031/5680 [5:33:04<12:00:33, 11.85s/it] 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2032/5680 [5:33:15<11:42:01, 11.55s/it]                                                                                                                                                                                                                                             {'loss': '0.4242', 'grad_norm': '0.2729', 'learning_rate': '0.0001433', 'ppl': '1.528', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '754.7', 'tokens/total': 16646144, 'tokens/trainable': 16458572, 'epoch': '3.009'}
 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2032/5680 [5:33:15<11:42:01, 11.55s/it] 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2033/5680 [5:33:27<11:46:05, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.9772', 'grad_norm': '0.3613', 'learning_rate': '0.0001432', 'ppl': '2.657', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '688.9', 'tokens/total': 16654336, 'tokens/trainable': 16466680, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▎                                                                                                                          | 2033/5680 [5:33:27<11:46:05, 11.62s/it] 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2034/5680 [5:33:40<12:03:37, 11.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5157', 'grad_norm': '0.3231', 'learning_rate': '0.0001432', 'ppl': '1.675', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.9', 'tokens/total': 16662528, 'tokens/trainable': 16474859, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2034/5680 [5:33:40<12:03:37, 11.91s/it] 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2035/5680 [5:33:50<11:36:25, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.422', 'grad_norm': '0.3033', 'learning_rate': '0.0001431', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '783', 'tokens/total': 16670720, 'tokens/trainable': 16483021, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2035/5680 [5:33:50<11:36:25, 11.46s/it] 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2036/5680 [5:34:02<11:47:18, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7612', 'grad_norm': '0.282', 'learning_rate': '0.0001431', 'ppl': '2.141', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.7', 'tokens/total': 16678912, 'tokens/trainable': 16491205, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2036/5680 [5:34:02<11:47:18, 11.65s/it] 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2037/5680 [5:34:15<12:03:39, 11.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5589', 'grad_norm': '0.2756', 'learning_rate': '0.000143', 'ppl': '1.749', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.4', 'tokens/total': 16687104, 'tokens/trainable': 16499390, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▍                                                                                                                          | 2037/5680 [5:34:15<12:03:39, 11.92s/it] 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2038/5680 [5:34:25<11:30:37, 11.38s/it]                                                                                                                                                                                                                                             {'loss': '0.676', 'grad_norm': '0.3535', 'learning_rate': '0.000143', 'ppl': '1.966', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '803.5', 'tokens/total': 16695296, 'tokens/trainable': 16507513, 'epoch': '3.01'}
 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2038/5680 [5:34:25<11:30:37, 11.38s/it] 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2039/5680 [5:34:37<11:50:26, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.3649', 'grad_norm': '0.2981', 'learning_rate': '0.0001429', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.5', 'tokens/total': 16703488, 'tokens/trainable': 16515577, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2039/5680 [5:34:37<11:50:26, 11.71s/it] 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2040/5680 [5:34:49<11:55:48, 11.80s/it]                                                                                                                                                                                                                                             {'loss': '0.5343', 'grad_norm': '0.2584', 'learning_rate': '0.0001429', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '680.9', 'tokens/total': 16711680, 'tokens/trainable': 16523749, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▌                                                                                                                          | 2040/5680 [5:34:49<11:55:48, 11.80s/it] 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2041/5680 [5:35:00<11:33:13, 11.43s/it]                                                                                                                                                                                                                                             {'loss': '0.8237', 'grad_norm': '0.3187', 'learning_rate': '0.0001428', 'ppl': '2.279', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '769.9', 'tokens/total': 16719872, 'tokens/trainable': 16531875, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2041/5680 [5:35:00<11:33:13, 11.43s/it] 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2042/5680 [5:35:12<11:51:09, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.6535', 'grad_norm': '0.2863', 'learning_rate': '0.0001428', 'ppl': '1.922', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.5', 'tokens/total': 16728064, 'tokens/trainable': 16540030, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2042/5680 [5:35:12<11:51:09, 11.73s/it] 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2043/5680 [5:35:24<11:49:21, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.555', 'grad_norm': '0.267', 'learning_rate': '0.0001427', 'ppl': '1.742', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '701.2', 'tokens/total': 16736256, 'tokens/trainable': 16548190, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2043/5680 [5:35:24<11:49:21, 11.70s/it] 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2044/5680 [5:35:35<11:37:34, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.5757', 'grad_norm': '0.263', 'learning_rate': '0.0001427', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '738.7', 'tokens/total': 16744448, 'tokens/trainable': 16556355, 'epoch': '3.011'}
 36%|████████████████████████████████████████████████████████████████████▋                                                                                                                          | 2044/5680 [5:35:35<11:37:34, 11.51s/it] 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2045/5680 [5:35:47<11:54:18, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.431', 'grad_norm': '0.2826', 'learning_rate': '0.0001426', 'ppl': '1.539', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '658.4', 'tokens/total': 16752640, 'tokens/trainable': 16564544, 'epoch': '3.012'}
 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2045/5680 [5:35:47<11:54:18, 11.79s/it] 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2046/5680 [5:35:59<11:45:06, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.6842', 'grad_norm': '0.3302', 'learning_rate': '0.0001426', 'ppl': '1.982', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '722.2', 'tokens/total': 16760832, 'tokens/trainable': 16572698, 'epoch': '3.012'}
 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2046/5680 [5:35:59<11:45:06, 11.64s/it] 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2047/5680 [5:36:10<11:38:11, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.5954', 'grad_norm': '0.2809', 'learning_rate': '0.0001425', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721.8', 'tokens/total': 16769024, 'tokens/trainable': 16580826, 'epoch': '3.012'}
 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2047/5680 [5:36:10<11:38:11, 11.53s/it] 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2048/5680 [5:36:22<11:55:46, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.6544', 'grad_norm': '0.3493', 'learning_rate': '0.0001425', 'ppl': '1.924', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.4', 'tokens/total': 16777216, 'tokens/trainable': 16588945, 'epoch': '3.012'}
 36%|████████████████████████████████████████████████████████████████████▊                                                                                                                          | 2048/5680 [5:36:22<11:55:46, 11.82s/it] 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2049/5680 [5:36:33<11:38:35, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.4325', 'grad_norm': '0.2883', 'learning_rate': '0.0001424', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '748.2', 'tokens/total': 16785408, 'tokens/trainable': 16597088, 'epoch': '3.012'}
 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2049/5680 [5:36:33<11:38:35, 11.54s/it] 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2050/5680 [5:36:45<11:41:16, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.6246', 'grad_norm': '0.3305', 'learning_rate': '0.0001424', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '698', 'tokens/total': 16793600, 'tokens/trainable': 16605250, 'epoch': '3.013'}
 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2050/5680 [5:36:45<11:41:16, 11.59s/it] 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2051/5680 [5:36:58<12:01:32, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7814', 'grad_norm': '0.3288', 'learning_rate': '0.0001423', 'ppl': '2.185', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642.2', 'tokens/total': 16801792, 'tokens/trainable': 16613415, 'epoch': '3.013'}
 36%|████████████████████████████████████████████████████████████████████▉                                                                                                                          | 2051/5680 [5:36:58<12:01:32, 11.93s/it] 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2052/5680 [5:37:08<11:33:35, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.392', 'grad_norm': '0.2608', 'learning_rate': '0.0001423', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '783.4', 'tokens/total': 16809984, 'tokens/trainable': 16621556, 'epoch': '3.013'}
 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2052/5680 [5:37:08<11:33:35, 11.47s/it] 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2053/5680 [5:37:20<11:44:05, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5718', 'grad_norm': '0.2689', 'learning_rate': '0.0001422', 'ppl': '1.772', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.3', 'tokens/total': 16818176, 'tokens/trainable': 16629729, 'epoch': '3.013'}
 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2053/5680 [5:37:20<11:44:05, 11.65s/it] 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2054/5680 [5:37:33<12:01:00, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5558', 'grad_norm': '0.2877', 'learning_rate': '0.0001422', 'ppl': '1.743', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.2', 'tokens/total': 16826368, 'tokens/trainable': 16637874, 'epoch': '3.013'}
 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2054/5680 [5:37:33<12:01:00, 11.93s/it] 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2055/5680 [5:37:44<11:45:01, 11.67s/it]                                                                                                                                                                                                                                             {'loss': '0.5484', 'grad_norm': '0.326', 'learning_rate': '0.0001421', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '738.7', 'tokens/total': 16834560, 'tokens/trainable': 16646038, 'epoch': '3.013'}
 36%|█████████████████████████████████████████████████████████████████████                                                                                                                          | 2055/5680 [5:37:44<11:45:01, 11.67s/it] 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2056/5680 [5:37:56<11:47:44, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.6219', 'grad_norm': '0.2488', 'learning_rate': '0.0001421', 'ppl': '1.863', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '691.3', 'tokens/total': 16842752, 'tokens/trainable': 16654201, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2056/5680 [5:37:56<11:47:44, 11.72s/it] 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2057/5680 [5:38:08<12:03:05, 11.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6665', 'grad_norm': '0.3607', 'learning_rate': '0.000142', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '649.8', 'tokens/total': 16850944, 'tokens/trainable': 16662369, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2057/5680 [5:38:08<12:03:05, 11.97s/it] 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2058/5680 [5:38:19<11:34:09, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.7148', 'grad_norm': '0.3473', 'learning_rate': '0.000142', 'ppl': '2.044', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '783.8', 'tokens/total': 16859136, 'tokens/trainable': 16670508, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2058/5680 [5:38:19<11:34:09, 11.50s/it] 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2059/5680 [5:38:31<11:46:44, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6452', 'grad_norm': '0.2907', 'learning_rate': '0.0001419', 'ppl': '1.906', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '670.2', 'tokens/total': 16867328, 'tokens/trainable': 16678680, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2059/5680 [5:38:31<11:46:44, 11.71s/it] 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2060/5680 [5:38:43<12:00:04, 11.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3849', 'grad_norm': '0.2788', 'learning_rate': '0.0001419', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '657.2', 'tokens/total': 16875520, 'tokens/trainable': 16686866, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2060/5680 [5:38:43<12:00:04, 11.94s/it] 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2061/5680 [5:38:54<11:29:24, 11.43s/it]                                                                                                                                                                                                                                             {'loss': '0.8212', 'grad_norm': '0.2894', 'learning_rate': '0.0001418', 'ppl': '2.273', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '798.1', 'tokens/total': 16883712, 'tokens/trainable': 16695028, 'epoch': '3.014'}
 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2061/5680 [5:38:54<11:29:24, 11.43s/it] 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2062/5680 [5:39:06<11:47:19, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.5384', 'grad_norm': '0.2618', 'learning_rate': '0.0001418', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.6', 'tokens/total': 16891904, 'tokens/trainable': 16703189, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2062/5680 [5:39:06<11:47:19, 11.73s/it] 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2063/5680 [5:39:18<11:53:00, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6988', 'grad_norm': '0.2796', 'learning_rate': '0.0001417', 'ppl': '2.011', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.2', 'tokens/total': 16900096, 'tokens/trainable': 16711363, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▎                                                                                                                         | 2063/5680 [5:39:18<11:53:00, 11.83s/it] 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2064/5680 [5:39:29<11:31:20, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.685', 'grad_norm': '0.331', 'learning_rate': '0.0001417', 'ppl': '1.984', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '769.6', 'tokens/total': 16908288, 'tokens/trainable': 16719534, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2064/5680 [5:39:29<11:31:20, 11.47s/it] 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2065/5680 [5:39:41<11:52:02, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.4904', 'grad_norm': '0.2696', 'learning_rate': '0.0001416', 'ppl': '1.633', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '642.9', 'tokens/total': 16916480, 'tokens/trainable': 16727648, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2065/5680 [5:39:41<11:52:02, 11.82s/it] 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2066/5680 [5:39:53<11:52:22, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6252', 'grad_norm': '0.3251', 'learning_rate': '0.0001416', 'ppl': '1.869', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '689', 'tokens/total': 16924672, 'tokens/trainable': 16735810, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▍                                                                                                                         | 2066/5680 [5:39:53<11:52:22, 11.83s/it] 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2067/5680 [5:40:04<11:34:43, 11.54s/it]                                                                                                                                                                                                                                             {'loss': '0.5724', 'grad_norm': '0.2916', 'learning_rate': '0.0001415', 'ppl': '1.772', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '748.6', 'tokens/total': 16932864, 'tokens/trainable': 16743930, 'epoch': '3.015'}
 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2067/5680 [5:40:04<11:34:43, 11.54s/it] 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2068/5680 [5:40:17<11:52:38, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.8287', 'grad_norm': '0.3338', 'learning_rate': '0.0001415', 'ppl': '2.29', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.6', 'tokens/total': 16941056, 'tokens/trainable': 16752112, 'epoch': '3.016'}
 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2068/5680 [5:40:17<11:52:38, 11.84s/it] 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2069/5680 [5:40:28<11:50:38, 11.81s/it]                                                                                                                                                                                                                                             {'loss': '0.6825', 'grad_norm': '0.2891', 'learning_rate': '0.0001414', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '697.9', 'tokens/total': 16949248, 'tokens/trainable': 16760299, 'epoch': '3.016'}
 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2069/5680 [5:40:28<11:50:38, 11.81s/it] 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2070/5680 [5:40:39<11:36:15, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.5578', 'grad_norm': '0.281', 'learning_rate': '0.0001414', 'ppl': '1.747', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '741.4', 'tokens/total': 16957440, 'tokens/trainable': 16768462, 'epoch': '3.016'}
 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                         | 2070/5680 [5:40:39<11:36:15, 11.57s/it] 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2071/5680 [5:40:52<11:53:44, 11.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8059', 'grad_norm': '0.3006', 'learning_rate': '0.0001413', 'ppl': '2.239', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647.3', 'tokens/total': 16965632, 'tokens/trainable': 16776586, 'epoch': '3.016'}
 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2071/5680 [5:40:52<11:53:44, 11.87s/it] 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2072/5680 [5:41:03<11:41:42, 11.67s/it]                                                                                                                                                                                                                                             {'loss': '0.534', 'grad_norm': '0.2823', 'learning_rate': '0.0001413', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '725.2', 'tokens/total': 16973824, 'tokens/trainable': 16784710, 'epoch': '3.016'}
 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2072/5680 [5:41:03<11:41:42, 11.67s/it] 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2073/5680 [5:41:14<11:35:37, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.6339', 'grad_norm': '0.3518', 'learning_rate': '0.0001412', 'ppl': '1.885', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '722.3', 'tokens/total': 16982016, 'tokens/trainable': 16792892, 'epoch': '3.017'}
 36%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2073/5680 [5:41:14<11:35:37, 11.57s/it] 37%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2074/5680 [5:41:27<11:53:51, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4238', 'grad_norm': '0.2376', 'learning_rate': '0.0001412', 'ppl': '1.528', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '643.3', 'tokens/total': 16990208, 'tokens/trainable': 16800988, 'epoch': '3.017'}
 37%|█████████████████████████████████████████████████████████████████████▋                                                                                                                         | 2074/5680 [5:41:27<11:53:51, 11.88s/it] 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2075/5680 [5:41:38<11:36:58, 11.60s/it]                                                                                                                                                                                                                                             {'loss': '0.7626', 'grad_norm': '0.343', 'learning_rate': '0.0001411', 'ppl': '2.144', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '742.7', 'tokens/total': 16998400, 'tokens/trainable': 16809120, 'epoch': '3.017'}
 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2075/5680 [5:41:38<11:36:58, 11.60s/it] 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2076/5680 [5:41:50<11:38:01, 11.62s/it]                                                                                                                                                                                                                                             {'loss': '0.5515', 'grad_norm': '0.3397', 'learning_rate': '0.0001411', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '697.4', 'tokens/total': 17006592, 'tokens/trainable': 16817248, 'epoch': '3.017'}
 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2076/5680 [5:41:50<11:38:01, 11.62s/it] 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2077/5680 [5:42:02<11:55:59, 11.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7494', 'grad_norm': '0.2918', 'learning_rate': '0.000141', 'ppl': '2.116', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648', 'tokens/total': 17014784, 'tokens/trainable': 16825428, 'epoch': '3.017'}
 37%|█████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2077/5680 [5:42:02<11:55:59, 11.92s/it] 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2078/5680 [5:42:13<11:33:47, 11.56s/it]                                                                                                                                                                                                                                             {'loss': '0.6311', 'grad_norm': '0.2877', 'learning_rate': '0.000141', 'ppl': '1.88', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '761.6', 'tokens/total': 17022976, 'tokens/trainable': 16833574, 'epoch': '3.017'}
 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2078/5680 [5:42:13<11:33:47, 11.56s/it] 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2079/5680 [5:42:25<11:38:28, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.7544', 'grad_norm': '0.3469', 'learning_rate': '0.0001409', 'ppl': '2.126', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '689.2', 'tokens/total': 17031168, 'tokens/trainable': 16841718, 'epoch': '3.018'}
 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2079/5680 [5:42:25<11:38:28, 11.64s/it] 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2080/5680 [5:42:37<11:55:34, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6377', 'grad_norm': '0.3672', 'learning_rate': '0.0001409', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '647', 'tokens/total': 17039360, 'tokens/trainable': 16849864, 'epoch': '3.018'}
 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2080/5680 [5:42:37<11:55:34, 11.93s/it] 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2081/5680 [5:42:48<11:27:29, 11.46s/it]                                                                                                                                                                                                                                             {'loss': '0.3909', 'grad_norm': '0.2391', 'learning_rate': '0.0001408', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '788.7', 'tokens/total': 17047552, 'tokens/trainable': 16858044, 'epoch': '3.018'}
 37%|█████████████████████████████████████████████████████████████████████▉                                                                                                                         | 2081/5680 [5:42:48<11:27:29, 11.46s/it] 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2082/5680 [5:43:00<11:41:30, 11.70s/it]                                                                                                                                                                                                                                             {'loss': '0.4564', 'grad_norm': '0.2869', 'learning_rate': '0.0001408', 'ppl': '1.578', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '666.9', 'tokens/total': 17055744, 'tokens/trainable': 16866212, 'epoch': '3.018'}
 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2082/5680 [5:43:00<11:41:30, 11.70s/it] 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2083/5680 [5:43:13<11:57:16, 11.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6307', 'grad_norm': '0.331', 'learning_rate': '0.0001407', 'ppl': '1.879', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '646.6', 'tokens/total': 17063936, 'tokens/trainable': 16874344, 'epoch': '3.018'}
 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2083/5680 [5:43:13<11:57:16, 11.96s/it] 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2084/5680 [5:43:23<11:23:56, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.4491', 'grad_norm': '0.2372', 'learning_rate': '0.0001407', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '806.3', 'tokens/total': 17072128, 'tokens/trainable': 16882500, 'epoch': '3.018'}
 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2084/5680 [5:43:23<11:23:56, 11.41s/it] 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2085/5680 [5:43:35<11:42:11, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.6404', 'grad_norm': '0.3124', 'learning_rate': '0.0001406', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.6', 'tokens/total': 17080320, 'tokens/trainable': 16890664, 'epoch': '3.019'}
 37%|██████████████████████████████████████████████████████████████████████                                                                                                                         | 2085/5680 [5:43:35<11:42:11, 11.72s/it] 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2086/5680 [5:43:47<11:48:13, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.5034', 'grad_norm': '0.2964', 'learning_rate': '0.0001406', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '675.5', 'tokens/total': 17088512, 'tokens/trainable': 16898812, 'epoch': '3.019'}
 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2086/5680 [5:43:47<11:48:13, 11.82s/it] 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2087/5680 [5:43:58<11:25:29, 11.45s/it]                                                                                                                                                                                                                                             {'loss': '0.5994', 'grad_norm': '0.2743', 'learning_rate': '0.0001405', 'ppl': '1.821', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '770.5', 'tokens/total': 17096704, 'tokens/trainable': 16906946, 'epoch': '3.019'}
 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2087/5680 [5:43:58<11:25:29, 11.45s/it] 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2088/5680 [5:44:10<11:42:24, 11.73s/it]                                                                                                                                                                                                                                             {'loss': '0.6415', 'grad_norm': '0.3202', 'learning_rate': '0.0001405', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.7', 'tokens/total': 17104896, 'tokens/trainable': 16915024, 'epoch': '3.019'}
 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2088/5680 [5:44:10<11:42:24, 11.73s/it] 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2089/5680 [5:44:22<11:41:34, 11.72s/it]                                                                                                                                                                                                                                             {'loss': '0.8033', 'grad_norm': '0.3133', 'learning_rate': '0.0001404', 'ppl': '2.233', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '695.2', 'tokens/total': 17113088, 'tokens/trainable': 16923152, 'epoch': '3.019'}
 37%|██████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2089/5680 [5:44:22<11:41:34, 11.72s/it] 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2090/5680 [5:44:33<11:26:30, 11.47s/it]                                                                                                                                                                                                                                             {'loss': '0.548', 'grad_norm': '0.3287', 'learning_rate': '0.0001404', 'ppl': '1.73', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '747.7', 'tokens/total': 17121280, 'tokens/trainable': 16931288, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2090/5680 [5:44:33<11:26:30, 11.47s/it] 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2091/5680 [5:44:45<11:44:00, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.9066', 'grad_norm': '0.3201', 'learning_rate': '0.0001403', 'ppl': '2.476', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '656.8', 'tokens/total': 17129472, 'tokens/trainable': 16939468, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2091/5680 [5:44:45<11:44:00, 11.77s/it] 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2092/5680 [5:44:57<11:36:18, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.6136', 'grad_norm': '0.2911', 'learning_rate': '0.0001403', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721', 'tokens/total': 17137664, 'tokens/trainable': 16947648, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▎                                                                                                                        | 2092/5680 [5:44:57<11:36:18, 11.64s/it] 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2093/5680 [5:45:08<11:29:25, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.5709', 'grad_norm': '0.2447', 'learning_rate': '0.0001402', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721.7', 'tokens/total': 17145856, 'tokens/trainable': 16955772, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2093/5680 [5:45:08<11:29:25, 11.53s/it] 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2094/5680 [5:45:20<11:47:37, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6334', 'grad_norm': '0.3068', 'learning_rate': '0.0001402', 'ppl': '1.884', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.4', 'tokens/total': 17154048, 'tokens/trainable': 16963948, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2094/5680 [5:45:20<11:47:37, 11.84s/it] 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2095/5680 [5:45:32<11:36:19, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.5596', 'grad_norm': '0.3278', 'learning_rate': '0.0001401', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '727.4', 'tokens/total': 17162240, 'tokens/trainable': 16972108, 'epoch': '3.02'}
 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2095/5680 [5:45:32<11:36:19, 11.65s/it] 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2096/5680 [5:45:43<11:32:16, 11.59s/it]                                                                                                                                                                                                                                             {'loss': '0.5676', 'grad_norm': '0.3073', 'learning_rate': '0.00014', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '713.9', 'tokens/total': 17170432, 'tokens/trainable': 16980266, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▍                                                                                                                        | 2096/5680 [5:45:43<11:32:16, 11.59s/it] 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2097/5680 [5:45:56<11:49:23, 11.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.2787', 'learning_rate': '0.00014', 'ppl': '1.709', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '644.1', 'tokens/total': 17178624, 'tokens/trainable': 16988352, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2097/5680 [5:45:56<11:49:23, 11.88s/it] 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2098/5680 [5:46:07<11:46:19, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.391', 'grad_norm': '0.3647', 'learning_rate': '0.0001399', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '694.5', 'tokens/total': 17186816, 'tokens/trainable': 16996488, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2098/5680 [5:46:07<11:46:19, 11.83s/it] 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2099/5680 [5:46:19<11:37:04, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.6309', 'grad_norm': '0.2907', 'learning_rate': '0.0001399', 'ppl': '1.879', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '721.2', 'tokens/total': 17195008, 'tokens/trainable': 17004648, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2099/5680 [5:46:19<11:37:04, 11.68s/it] 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2100/5680 [5:46:31<11:51:32, 11.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4326', 'grad_norm': '0.2862', 'learning_rate': '0.0001398', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.5', 'tokens/total': 17203200, 'tokens/trainable': 17012802, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▌                                                                                                                        | 2100/5680 [5:46:31<11:51:32, 11.93s/it] 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2101/5680 [5:46:42<11:35:06, 11.65s/it]                                                                                                                                                                                                                                             {'loss': '0.7196', 'grad_norm': '0.3492', 'learning_rate': '0.0001398', 'ppl': '2.054', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '737.6', 'tokens/total': 17211392, 'tokens/trainable': 17020924, 'epoch': '3.021'}
 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2101/5680 [5:46:42<11:35:06, 11.65s/it] 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2102/5680 [5:46:54<11:38:22, 11.71s/it]                                                                                                                                                                                                                                             {'loss': '0.5488', 'grad_norm': '0.2933', 'learning_rate': '0.0001397', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '690.3', 'tokens/total': 17219584, 'tokens/trainable': 17029094, 'epoch': '3.022'}
 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2102/5680 [5:46:54<11:38:22, 11.71s/it] 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2103/5680 [5:47:07<11:53:02, 11.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7619', 'grad_norm': '0.3372', 'learning_rate': '0.0001397', 'ppl': '2.142', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.5', 'tokens/total': 17227776, 'tokens/trainable': 17037260, 'epoch': '3.022'}
 37%|██████████████████████████████████████████████████████████████████████▋                                                                                                                        | 2103/5680 [5:47:07<11:53:02, 11.96s/it] 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2104/5680 [5:47:17<11:32:59, 11.63s/it]                                                                                                                                                                                                                                             {'loss': '0.4122', 'grad_norm': '0.266', 'learning_rate': '0.0001396', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '748.2', 'tokens/total': 17235968, 'tokens/trainable': 17045370, 'epoch': '3.022'}
 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2104/5680 [5:47:17<11:32:59, 11.63s/it] 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2105/5680 [5:47:29<11:35:08, 11.67s/it]                                                                                                                                                                                                                                             {'loss': '0.452', 'grad_norm': '0.2876', 'learning_rate': '0.0001396', 'ppl': '1.571', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '694.5', 'tokens/total': 17244160, 'tokens/trainable': 17053524, 'epoch': '3.022'}
 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2105/5680 [5:47:29<11:35:08, 11.67s/it] 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2106/5680 [5:47:42<11:52:09, 11.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8683', 'grad_norm': '0.3372', 'learning_rate': '0.0001395', 'ppl': '2.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648.2', 'tokens/total': 17252352, 'tokens/trainable': 17061710, 'epoch': '3.022'}
 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2106/5680 [5:47:42<11:52:09, 11.96s/it] 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2107/5680 [5:47:52<11:25:11, 11.51s/it]                                                                                                                                                                                                                                             {'loss': '0.6469', 'grad_norm': '0.271', 'learning_rate': '0.0001395', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '780.3', 'tokens/total': 17260544, 'tokens/trainable': 17069864, 'epoch': '3.023'}
 37%|██████████████████████████████████████████████████████████████████████▊                                                                                                                        | 2107/5680 [5:47:52<11:25:11, 11.51s/it] 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2108/5680 [5:48:04<11:35:24, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.4852', 'grad_norm': '0.3304', 'learning_rate': '0.0001394', 'ppl': '1.625', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '675.8', 'tokens/total': 17268736, 'tokens/trainable': 17078028, 'epoch': '3.023'}
 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2108/5680 [5:48:04<11:35:24, 11.68s/it] 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2109/5680 [5:48:17<11:51:19, 11.95s/it]                                                                                                                                                                                                                                             {'loss': '0.597', 'grad_norm': '0.3256', 'learning_rate': '0.0001394', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '645.7', 'tokens/total': 17276928, 'tokens/trainable': 17086152, 'epoch': '3.023'}
 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2109/5680 [5:48:17<11:51:19, 11.95s/it] 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2110/5680 [5:48:27<11:18:50, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.6946', 'grad_norm': '0.3609', 'learning_rate': '0.0001393', 'ppl': '2.003', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '803.4', 'tokens/total': 17285120, 'tokens/trainable': 17094296, 'epoch': '3.023'}
 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2110/5680 [5:48:27<11:18:50, 11.41s/it] 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2111/5680 [5:48:40<11:38:25, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.6909', 'grad_norm': '0.2838', 'learning_rate': '0.0001393', 'ppl': '1.996', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.9', 'tokens/total': 17293312, 'tokens/trainable': 17102452, 'epoch': '3.023'}
 37%|██████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2111/5680 [5:48:40<11:38:25, 11.74s/it] 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2112/5680 [5:48:52<11:44:31, 11.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6691', 'grad_norm': '0.3072', 'learning_rate': '0.0001392', 'ppl': '1.952', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '676', 'tokens/total': 17301504, 'tokens/trainable': 17110626, 'epoch': '3.023'}
 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2112/5680 [5:48:52<11:44:31, 11.85s/it] 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2113/5680 [5:49:02<11:22:26, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.4784', 'grad_norm': '0.2913', 'learning_rate': '0.0001392', 'ppl': '1.613', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '768.2', 'tokens/total': 17309696, 'tokens/trainable': 17118776, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2113/5680 [5:49:02<11:22:26, 11.48s/it] 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2114/5680 [5:49:15<11:39:56, 11.78s/it]                                                                                                                                                                                                                                             {'loss': '0.8724', 'grad_norm': '0.3113', 'learning_rate': '0.0001391', 'ppl': '2.393', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.3', 'tokens/total': 17317888, 'tokens/trainable': 17126944, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2114/5680 [5:49:15<11:39:56, 11.78s/it] 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2115/5680 [5:49:27<11:42:12, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.6935', 'grad_norm': '0.2802', 'learning_rate': '0.0001391', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.3', 'tokens/total': 17326080, 'tokens/trainable': 17135022, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████                                                                                                                        | 2115/5680 [5:49:27<11:42:12, 11.82s/it] 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2116/5680 [5:49:38<11:24:51, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.8177', 'grad_norm': '0.2923', 'learning_rate': '0.000139', 'ppl': '2.265', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '751.7', 'tokens/total': 17334272, 'tokens/trainable': 17143172, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2116/5680 [5:49:38<11:24:51, 11.53s/it] 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2117/5680 [5:49:50<11:42:41, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.7926', 'grad_norm': '0.3968', 'learning_rate': '0.000139', 'ppl': '2.209', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '650.4', 'tokens/total': 17342464, 'tokens/trainable': 17151324, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2117/5680 [5:49:50<11:42:41, 11.83s/it] 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2118/5680 [5:50:02<11:42:00, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6088', 'grad_norm': '0.3369', 'learning_rate': '0.0001389', 'ppl': '1.838', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '688', 'tokens/total': 17350656, 'tokens/trainable': 17159444, 'epoch': '3.024'}
 37%|███████████████████████████████████████████████████████████████████████▏                                                                                                                       | 2118/5680 [5:50:02<11:42:00, 11.83s/it] 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2119/5680 [5:50:13<11:26:31, 11.57s/it]                                                                                                                                                                                                                                             {'loss': '0.6357', 'grad_norm': '0.3419', 'learning_rate': '0.0001389', 'ppl': '1.888', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '742.7', 'tokens/total': 17358848, 'tokens/trainable': 17167584, 'epoch': '3.025'}
 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2119/5680 [5:50:13<11:26:31, 11.57s/it] 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2120/5680 [5:50:25<11:42:38, 11.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5888', 'grad_norm': '0.3192', 'learning_rate': '0.0001388', 'ppl': '1.802', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '655.5', 'tokens/total': 17367040, 'tokens/trainable': 17175764, 'epoch': '3.025'}
 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2120/5680 [5:50:25<11:42:38, 11.84s/it] 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2121/5680 [5:50:37<11:32:45, 11.68s/it]                                                                                                                                                                                                                                             {'loss': '0.4765', 'grad_norm': '0.25', 'learning_rate': '0.0001388', 'ppl': '1.61', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '719.5', 'tokens/total': 17375232, 'tokens/trainable': 17183888, 'epoch': '3.025'}
 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2121/5680 [5:50:37<11:32:45, 11.68s/it] 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2122/5680 [5:50:48<11:23:32, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.5342', 'grad_norm': '0.2442', 'learning_rate': '0.0001387', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '729.6', 'tokens/total': 17383424, 'tokens/trainable': 17192038, 'epoch': '3.025'}
 37%|███████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2122/5680 [5:50:48<11:23:32, 11.53s/it] 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2123/5680 [5:51:00<11:41:15, 11.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5422', 'grad_norm': '0.2692', 'learning_rate': '0.0001387', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.6', 'tokens/total': 17391616, 'tokens/trainable': 17200226, 'epoch': '3.025'}
 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2123/5680 [5:51:00<11:41:15, 11.83s/it] 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2124/5680 [5:51:11<11:21:41, 11.50s/it]                                                                                                                                                                                                                                             {'loss': '0.759', 'grad_norm': '0.3262', 'learning_rate': '0.0001386', 'ppl': '2.136', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '755.3', 'tokens/total': 17399808, 'tokens/trainable': 17208336, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2124/5680 [5:51:11<11:21:41, 11.50s/it] 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2125/5680 [5:51:23<11:24:58, 11.56s/it]                                                                                                                                                                                                                                             {'loss': '0.6824', 'grad_norm': '0.3221', 'learning_rate': '0.0001386', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '699.3', 'tokens/total': 17408000, 'tokens/trainable': 17216508, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2125/5680 [5:51:23<11:24:58, 11.56s/it] 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2126/5680 [5:51:35<11:42:30, 11.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6697', 'grad_norm': '0.2923', 'learning_rate': '0.0001385', 'ppl': '1.954', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '652.1', 'tokens/total': 17416192, 'tokens/trainable': 17224692, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2126/5680 [5:51:35<11:42:30, 11.86s/it] 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2127/5680 [5:51:46<11:16:52, 11.43s/it]                                                                                                                                                                                                                                             {'loss': '0.6127', 'grad_norm': '0.3371', 'learning_rate': '0.0001385', 'ppl': '1.845', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '780.2', 'tokens/total': 17424384, 'tokens/trainable': 17232828, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2127/5680 [5:51:46<11:16:52, 11.43s/it] 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2128/5680 [5:51:58<11:27:16, 11.61s/it]                                                                                                                                                                                                                                             {'loss': '0.5103', 'grad_norm': '0.2721', 'learning_rate': '0.0001384', 'ppl': '1.666', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '678.7', 'tokens/total': 17432576, 'tokens/trainable': 17240980, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2128/5680 [5:51:58<11:27:16, 11.61s/it] 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2129/5680 [5:52:10<11:37:54, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.7642', 'grad_norm': '0.2999', 'learning_rate': '0.0001384', 'ppl': '2.147', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '670.3', 'tokens/total': 17440768, 'tokens/trainable': 17249166, 'epoch': '3.026'}
 37%|███████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2129/5680 [5:52:10<11:37:54, 11.79s/it] 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2130/5680 [5:52:20<11:12:02, 11.36s/it]                                                                                                                                                                                                                                             {'loss': '0.8169', 'grad_norm': '0.3556', 'learning_rate': '0.0001383', 'ppl': '2.263', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '788.1', 'tokens/total': 17448960, 'tokens/trainable': 17257310, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2130/5680 [5:52:20<11:12:02, 11.36s/it] 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2131/5680 [5:52:33<11:31:35, 11.69s/it]                                                                                                                                                                                                                                             {'loss': '0.5078', 'grad_norm': '0.2844', 'learning_rate': '0.0001383', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '651.6', 'tokens/total': 17457152, 'tokens/trainable': 17265432, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2131/5680 [5:52:33<11:31:35, 11.69s/it] 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2132/5680 [5:52:45<11:36:14, 11.77s/it]                                                                                                                                                                                                                                             {'loss': '0.4569', 'grad_norm': '0.2493', 'learning_rate': '0.0001382', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '682.2', 'tokens/total': 17465344, 'tokens/trainable': 17273584, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2132/5680 [5:52:45<11:36:14, 11.77s/it] 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2133/5680 [5:52:56<11:18:41, 11.48s/it]                                                                                                                                                                                                                                             {'loss': '0.4636', 'grad_norm': '0.3077', 'learning_rate': '0.0001382', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '757.8', 'tokens/total': 17473536, 'tokens/trainable': 17281738, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2133/5680 [5:52:56<11:18:41, 11.48s/it] 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2134/5680 [5:53:08<11:36:33, 11.79s/it]                                                                                                                                                                                                                                             {'loss': '0.6585', 'grad_norm': '0.289', 'learning_rate': '0.0001381', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '648.9', 'tokens/total': 17481728, 'tokens/trainable': 17289846, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2134/5680 [5:53:08<11:36:33, 11.79s/it] 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2135/5680 [5:53:20<11:33:30, 11.74s/it]                                                                                                                                                                                                                                             {'loss': '0.5641', 'grad_norm': '0.3021', 'learning_rate': '0.0001381', 'ppl': '1.758', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '695.6', 'tokens/total': 17489920, 'tokens/trainable': 17297932, 'epoch': '3.027'}
 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2135/5680 [5:53:20<11:33:30, 11.74s/it] 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2136/5680 [5:53:31<11:20:44, 11.53s/it]                                                                                                                                                                                                                                             {'loss': '0.9249', 'grad_norm': '0.3808', 'learning_rate': '0.000138', 'ppl': '2.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '739.3', 'tokens/total': 17498112, 'tokens/trainable': 17306078, 'epoch': '3.028'}
 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2136/5680 [5:53:31<11:20:44, 11.53s/it] 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2137/5680 [5:53:43<11:37:42, 11.82s/it]                                                                                                                                                                                                                                             {'loss': '0.6658', 'grad_norm': '0.3375', 'learning_rate': '0.000138', 'ppl': '1.946', 'memory/max_active (GiB)': '10.24', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '653.7', 'tokens/total': 17506304, 'tokens/trainable': 17314240, 'epoch': '3.028'}
 38%|███████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2137/5680 [5:53:43<11:37:42, 11.82s/it] 38%|███████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2138/5680 [5:53:54<11:27:18, 11.64s/it]                                                                                                                                                                                                                                             {'loss': '0.527', 'grad_norm': '0.2814', 'learning_rate': '0.0001379', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '726.7', 'tokens/total': 17514496, 'tokens/trainable': 17322400, 'epoch': '3.028'}
 38%|███████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2138/5680 [5:53:55<11:27:18, 11.64s/it] 38%|███████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2139/5680 [5:54:04<10:55:14, 11.10s/it]                                                                                                                                                                                                                                             {'loss': '0.5913', 'grad_norm': '0.2863', 'learning_rate': '0.0001379', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '825.5', 'tokens/total': 17522688, 'tokens/trainable': 17330512, 'epoch': '3.028'}
 38%|███████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2139/5680 [5:54:04<10:55:14, 11.10s/it] 38%|████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2140/5680 [5:54:12<9:59:54, 10.17s/it]                                                                                                                                                                                                                                             {'loss': '0.7762', 'grad_norm': '0.367', 'learning_rate': '0.0001378', 'ppl': '2.173', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 17530880, 'tokens/trainable': 17338648, 'epoch': '3.028'}
 38%|████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2140/5680 [5:54:12<9:59:54, 10.17s/it] 38%|████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2141/5680 [5:54:21<9:25:54,  9.59s/it]                                                                                                                                                                                                                                             {'loss': '0.4401', 'grad_norm': '0.3109', 'learning_rate': '0.0001378', 'ppl': '1.553', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.5', 'tokens/total': 17539072, 'tokens/trainable': 17346832, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 2141/5680 [5:54:21<9:25:54,  9.59s/it] 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2142/5680 [5:54:29<8:56:22,  9.10s/it]                                                                                                                                                                                                                                             {'loss': '0.7771', 'grad_norm': '0.3366', 'learning_rate': '0.0001377', 'ppl': '2.175', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 17547264, 'tokens/trainable': 17354984, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2142/5680 [5:54:29<8:56:22,  9.10s/it] 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2143/5680 [5:54:36<8:35:03,  8.74s/it]                                                                                                                                                                                                                                             {'loss': '0.6694', 'grad_norm': '0.3323', 'learning_rate': '0.0001377', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 17555456, 'tokens/trainable': 17363168, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2143/5680 [5:54:36<8:35:03,  8.74s/it] 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2144/5680 [5:54:44<8:20:53,  8.50s/it]                                                                                                                                                                                                                                             {'loss': '0.5028', 'grad_norm': '0.2642', 'learning_rate': '0.0001376', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17563648, 'tokens/trainable': 17371344, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 2144/5680 [5:54:44<8:20:53,  8.50s/it] 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2145/5680 [5:54:52<8:10:12,  8.32s/it]                                                                                                                                                                                                                                             {'loss': '0.719', 'grad_norm': '0.3325', 'learning_rate': '0.0001376', 'ppl': '2.052', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 17571840, 'tokens/trainable': 17379500, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2145/5680 [5:54:52<8:10:12,  8.32s/it] 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2146/5680 [5:55:00<8:02:51,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.4782', 'grad_norm': '0.2396', 'learning_rate': '0.0001375', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 17580032, 'tokens/trainable': 17387620, 'epoch': '3.029'}
 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2146/5680 [5:55:00<8:02:51,  8.20s/it] 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2147/5680 [5:55:08<7:57:24,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.4211', 'grad_norm': '0.3076', 'learning_rate': '0.0001374', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 17588224, 'tokens/trainable': 17395728, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2147/5680 [5:55:08<7:57:24,  8.11s/it] 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2148/5680 [5:55:16<7:52:58,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4685', 'grad_norm': '0.2729', 'learning_rate': '0.0001374', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 17596416, 'tokens/trainable': 17403866, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 2148/5680 [5:55:16<7:52:58,  8.03s/it] 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2149/5680 [5:55:24<7:50:21,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5283', 'grad_norm': '0.2841', 'learning_rate': '0.0001373', 'ppl': '1.696', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 17604608, 'tokens/trainable': 17412042, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2149/5680 [5:55:24<7:50:21,  7.99s/it] 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2150/5680 [5:55:32<7:48:40,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5776', 'grad_norm': '0.3435', 'learning_rate': '0.0001373', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17612800, 'tokens/trainable': 17420176, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2150/5680 [5:55:32<7:48:40,  7.97s/it] 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2151/5680 [5:55:40<7:46:27,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.399', 'grad_norm': '0.2408', 'learning_rate': '0.0001372', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 17620992, 'tokens/trainable': 17428356, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2151/5680 [5:55:40<7:46:27,  7.93s/it] 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2152/5680 [5:55:47<7:45:12,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7024', 'grad_norm': '0.2975', 'learning_rate': '0.0001372', 'ppl': '2.019', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 17629184, 'tokens/trainable': 17436470, 'epoch': '3.03'}
 38%|████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 2152/5680 [5:55:47<7:45:12,  7.91s/it] 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2153/5680 [5:55:55<7:43:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.742', 'grad_norm': '0.3679', 'learning_rate': '0.0001371', 'ppl': '2.1', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 17637376, 'tokens/trainable': 17444634, 'epoch': '3.031'}
 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2153/5680 [5:55:55<7:43:50,  7.89s/it] 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2154/5680 [5:56:03<7:42:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6426', 'grad_norm': '0.3332', 'learning_rate': '0.0001371', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 17645568, 'tokens/trainable': 17452808, 'epoch': '3.031'}
 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2154/5680 [5:56:03<7:42:57,  7.88s/it] 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2155/5680 [5:56:11<7:42:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6999', 'grad_norm': '0.3784', 'learning_rate': '0.000137', 'ppl': '2.014', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 17653760, 'tokens/trainable': 17460946, 'epoch': '3.031'}
 38%|████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 2155/5680 [5:56:11<7:42:06,  7.87s/it] 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2156/5680 [5:56:19<7:41:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5094', 'grad_norm': '0.3015', 'learning_rate': '0.000137', 'ppl': '1.664', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 17661952, 'tokens/trainable': 17469104, 'epoch': '3.031'}
 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2156/5680 [5:56:19<7:41:54,  7.86s/it] 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2157/5680 [5:56:27<7:46:45,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4789', 'grad_norm': '0.2879', 'learning_rate': '0.0001369', 'ppl': '1.614', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 17670144, 'tokens/trainable': 17477260, 'epoch': '3.031'}
 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2157/5680 [5:56:27<7:46:45,  7.95s/it] 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2158/5680 [5:56:35<7:44:53,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6083', 'grad_norm': '0.302', 'learning_rate': '0.0001369', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 17678336, 'tokens/trainable': 17485370, 'epoch': '3.032'}
 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2158/5680 [5:56:35<7:44:53,  7.92s/it] 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2159/5680 [5:56:43<7:44:16,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4531', 'grad_norm': '0.2761', 'learning_rate': '0.0001368', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 17686528, 'tokens/trainable': 17493518, 'epoch': '3.032'}
 38%|████████████████████████████████████████████████████████████████████████▉                                                                                                                       | 2159/5680 [5:56:43<7:44:16,  7.91s/it] 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2160/5680 [5:56:51<7:43:43,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6456', 'grad_norm': '0.339', 'learning_rate': '0.0001368', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 17694720, 'tokens/trainable': 17501692, 'epoch': '3.032'}
 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2160/5680 [5:56:51<7:43:43,  7.90s/it] 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2161/5680 [5:56:58<7:42:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7829', 'grad_norm': '0.3589', 'learning_rate': '0.0001367', 'ppl': '2.188', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 17702912, 'tokens/trainable': 17509804, 'epoch': '3.032'}
 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2161/5680 [5:56:58<7:42:44,  7.89s/it] 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2162/5680 [5:57:06<7:42:27,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5908', 'grad_norm': '0.2651', 'learning_rate': '0.0001367', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 17711104, 'tokens/trainable': 17517892, 'epoch': '3.032'}
 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2162/5680 [5:57:06<7:42:27,  7.89s/it] 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2163/5680 [5:57:14<7:41:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9345', 'grad_norm': '0.3614', 'learning_rate': '0.0001366', 'ppl': '2.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 17719296, 'tokens/trainable': 17525996, 'epoch': '3.032'}
 38%|█████████████████████████████████████████████████████████████████████████                                                                                                                       | 2163/5680 [5:57:14<7:41:26,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2164/5680 [5:57:22<7:41:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5287', 'grad_norm': '0.2683', 'learning_rate': '0.0001366', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 17727488, 'tokens/trainable': 17534140, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2164/5680 [5:57:22<7:41:59,  7.88s/it] 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2165/5680 [5:57:30<7:41:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6776', 'grad_norm': '0.2846', 'learning_rate': '0.0001365', 'ppl': '1.969', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 17735680, 'tokens/trainable': 17542320, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2165/5680 [5:57:30<7:41:14,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2166/5680 [5:57:38<7:41:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6521', 'grad_norm': '0.2994', 'learning_rate': '0.0001365', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 17743872, 'tokens/trainable': 17550508, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 2166/5680 [5:57:38<7:41:50,  7.89s/it] 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2167/5680 [5:57:46<7:41:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5234', 'grad_norm': '0.2663', 'learning_rate': '0.0001364', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 17752064, 'tokens/trainable': 17558638, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2167/5680 [5:57:46<7:41:17,  7.88s/it] 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2168/5680 [5:57:54<7:41:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5858', 'grad_norm': '0.2714', 'learning_rate': '0.0001364', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 17760256, 'tokens/trainable': 17566788, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2168/5680 [5:57:54<7:41:26,  7.88s/it] 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2169/5680 [5:58:02<7:41:29,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5211', 'grad_norm': '0.31', 'learning_rate': '0.0001363', 'ppl': '1.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 17768448, 'tokens/trainable': 17574958, 'epoch': '3.033'}
 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2169/5680 [5:58:02<7:41:29,  7.89s/it] 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2170/5680 [5:58:09<7:40:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6799', 'grad_norm': '0.2975', 'learning_rate': '0.0001363', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 17776640, 'tokens/trainable': 17583116, 'epoch': '3.034'}
 38%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 2170/5680 [5:58:09<7:40:40,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2171/5680 [5:58:17<7:40:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4457', 'grad_norm': '0.263', 'learning_rate': '0.0001362', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 17784832, 'tokens/trainable': 17591276, 'epoch': '3.034'}
 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2171/5680 [5:58:17<7:40:19,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2172/5680 [5:58:25<7:40:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5446', 'grad_norm': '0.3167', 'learning_rate': '0.0001362', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 17793024, 'tokens/trainable': 17599372, 'epoch': '3.034'}
 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2172/5680 [5:58:25<7:40:39,  7.88s/it] 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2173/5680 [5:58:33<7:40:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7197', 'grad_norm': '0.3317', 'learning_rate': '0.0001361', 'ppl': '2.054', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 17801216, 'tokens/trainable': 17607560, 'epoch': '3.034'}
 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2173/5680 [5:58:33<7:40:13,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2174/5680 [5:58:41<7:39:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6681', 'grad_norm': '0.276', 'learning_rate': '0.0001361', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 17809408, 'tokens/trainable': 17615708, 'epoch': '3.034'}
 38%|█████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 2174/5680 [5:58:41<7:39:22,  7.86s/it] 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2175/5680 [5:58:49<7:39:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5083', 'grad_norm': '0.2758', 'learning_rate': '0.000136', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 17817600, 'tokens/trainable': 17623896, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2175/5680 [5:58:49<7:39:02,  7.86s/it] 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2176/5680 [5:58:57<7:39:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6778', 'grad_norm': '0.3656', 'learning_rate': '0.000136', 'ppl': '1.97', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 17825792, 'tokens/trainable': 17632014, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2176/5680 [5:58:57<7:39:14,  7.86s/it] 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2177/5680 [5:59:04<7:39:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7115', 'grad_norm': '0.3019', 'learning_rate': '0.0001359', 'ppl': '2.037', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 17833984, 'tokens/trainable': 17640196, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2177/5680 [5:59:04<7:39:37,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2178/5680 [5:59:12<7:39:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5161', 'grad_norm': '0.3117', 'learning_rate': '0.0001359', 'ppl': '1.675', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 17842176, 'tokens/trainable': 17648388, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 2178/5680 [5:59:12<7:39:38,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2179/5680 [5:59:20<7:38:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7399', 'grad_norm': '0.3343', 'learning_rate': '0.0001358', 'ppl': '2.096', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 17850368, 'tokens/trainable': 17656558, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2179/5680 [5:59:20<7:38:44,  7.86s/it] 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2180/5680 [5:59:28<7:39:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3698', 'grad_norm': '0.2954', 'learning_rate': '0.0001358', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17858560, 'tokens/trainable': 17664676, 'epoch': '3.035'}
 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2180/5680 [5:59:28<7:39:11,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2181/5680 [5:59:36<7:38:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7386', 'grad_norm': '0.3981', 'learning_rate': '0.0001357', 'ppl': '2.093', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 17866752, 'tokens/trainable': 17672812, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 2181/5680 [5:59:36<7:38:43,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2182/5680 [5:59:44<7:38:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5103', 'grad_norm': '0.3193', 'learning_rate': '0.0001356', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 17874944, 'tokens/trainable': 17680976, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2182/5680 [5:59:44<7:38:13,  7.86s/it] 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2183/5680 [5:59:52<7:38:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7054', 'grad_norm': '0.3444', 'learning_rate': '0.0001356', 'ppl': '2.025', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17883136, 'tokens/trainable': 17689080, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2183/5680 [5:59:52<7:38:24,  7.87s/it] 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2184/5680 [6:00:00<7:43:37,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6537', 'grad_norm': '0.3309', 'learning_rate': '0.0001355', 'ppl': '1.923', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.2', 'tokens/total': 17891328, 'tokens/trainable': 17697218, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2184/5680 [6:00:00<7:43:37,  7.96s/it] 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2185/5680 [6:00:08<7:42:11,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6514', 'grad_norm': '0.3169', 'learning_rate': '0.0001355', 'ppl': '1.918', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 17899520, 'tokens/trainable': 17705364, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 2185/5680 [6:00:08<7:42:11,  7.93s/it] 38%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2186/5680 [6:00:16<7:40:44,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.8826', 'grad_norm': '0.3696', 'learning_rate': '0.0001354', 'ppl': '2.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 17907712, 'tokens/trainable': 17713536, 'epoch': '3.036'}
 38%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2186/5680 [6:00:16<7:40:44,  7.91s/it] 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2187/5680 [6:00:23<7:39:50,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.507', 'grad_norm': '0.3234', 'learning_rate': '0.0001354', 'ppl': '1.66', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 17915904, 'tokens/trainable': 17721658, 'epoch': '3.037'}
 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2187/5680 [6:00:23<7:39:50,  7.90s/it] 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2188/5680 [6:00:31<7:38:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6941', 'grad_norm': '0.3155', 'learning_rate': '0.0001353', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 17924096, 'tokens/trainable': 17729838, 'epoch': '3.037'}
 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2188/5680 [6:00:31<7:38:49,  7.88s/it] 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2189/5680 [6:00:39<7:38:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.634', 'grad_norm': '0.3148', 'learning_rate': '0.0001353', 'ppl': '1.885', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 17932288, 'tokens/trainable': 17737968, 'epoch': '3.037'}
 39%|█████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 2189/5680 [6:00:39<7:38:25,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2190/5680 [6:00:47<7:38:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5101', 'grad_norm': '0.3078', 'learning_rate': '0.0001352', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 17940480, 'tokens/trainable': 17746096, 'epoch': '3.037'}
 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2190/5680 [6:00:47<7:38:30,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2191/5680 [6:00:55<7:38:33,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6246', 'grad_norm': '0.3381', 'learning_rate': '0.0001352', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 17948672, 'tokens/trainable': 17754272, 'epoch': '3.037'}
 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2191/5680 [6:00:55<7:38:33,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2192/5680 [6:01:03<7:37:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6502', 'grad_norm': '0.3095', 'learning_rate': '0.0001351', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 17956864, 'tokens/trainable': 17762456, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████                                                                                                                      | 2192/5680 [6:01:03<7:37:34,  7.87s/it] 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2193/5680 [6:01:11<7:37:43,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5507', 'grad_norm': '0.3812', 'learning_rate': '0.0001351', 'ppl': '1.734', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 17965056, 'tokens/trainable': 17770616, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2193/5680 [6:01:11<7:37:43,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2194/5680 [6:01:19<7:37:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7472', 'grad_norm': '0.3057', 'learning_rate': '0.000135', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 17973248, 'tokens/trainable': 17778768, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2194/5680 [6:01:19<7:37:50,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2195/5680 [6:01:26<7:37:51,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4349', 'grad_norm': '0.2774', 'learning_rate': '0.000135', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17981440, 'tokens/trainable': 17786888, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2195/5680 [6:01:26<7:37:51,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2196/5680 [6:01:34<7:37:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5995', 'grad_norm': '0.2791', 'learning_rate': '0.0001349', 'ppl': '1.821', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 17989632, 'tokens/trainable': 17795000, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 2196/5680 [6:01:34<7:37:41,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2197/5680 [6:01:42<7:36:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6892', 'grad_norm': '0.3495', 'learning_rate': '0.0001349', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 17997824, 'tokens/trainable': 17803186, 'epoch': '3.038'}
 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2197/5680 [6:01:42<7:36:29,  7.86s/it] 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2198/5680 [6:01:50<7:36:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4267', 'grad_norm': '0.2383', 'learning_rate': '0.0001348', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 18006016, 'tokens/trainable': 17811356, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2198/5680 [6:01:50<7:36:00,  7.86s/it] 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2199/5680 [6:01:58<7:36:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5559', 'grad_norm': '0.2707', 'learning_rate': '0.0001348', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18014208, 'tokens/trainable': 17819520, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2199/5680 [6:01:58<7:36:03,  7.86s/it] 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2200/5680 [6:02:06<7:36:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5465', 'grad_norm': '0.2898', 'learning_rate': '0.0001347', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18022400, 'tokens/trainable': 17827672, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 2200/5680 [6:02:06<7:36:36,  7.87s/it] 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2201/5680 [6:02:14<7:36:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.705', 'grad_norm': '0.3525', 'learning_rate': '0.0001347', 'ppl': '2.024', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18030592, 'tokens/trainable': 17835828, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2201/5680 [6:02:14<7:36:29,  7.87s/it] 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2202/5680 [6:02:21<7:36:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6113', 'grad_norm': '0.2962', 'learning_rate': '0.0001346', 'ppl': '1.843', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18038784, 'tokens/trainable': 17843988, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2202/5680 [6:02:21<7:36:23,  7.87s/it] 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2203/5680 [6:02:29<7:36:46,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6777', 'grad_norm': '0.3598', 'learning_rate': '0.0001346', 'ppl': '1.969', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 18046976, 'tokens/trainable': 17852124, 'epoch': '3.039'}
 39%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 2203/5680 [6:02:29<7:36:46,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2204/5680 [6:02:37<7:36:38,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.694', 'grad_norm': '0.3095', 'learning_rate': '0.0001345', 'ppl': '2.002', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18055168, 'tokens/trainable': 17860292, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2204/5680 [6:02:37<7:36:38,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2205/5680 [6:02:45<7:36:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5921', 'grad_norm': '0.3399', 'learning_rate': '0.0001345', 'ppl': '1.808', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18063360, 'tokens/trainable': 17868452, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2205/5680 [6:02:45<7:36:50,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2206/5680 [6:02:53<7:36:56,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8084', 'grad_norm': '0.3946', 'learning_rate': '0.0001344', 'ppl': '2.244', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 18071552, 'tokens/trainable': 17876560, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2206/5680 [6:02:53<7:36:56,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2207/5680 [6:03:01<7:36:43,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6172', 'grad_norm': '0.3417', 'learning_rate': '0.0001344', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18079744, 'tokens/trainable': 17884716, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 2207/5680 [6:03:01<7:36:43,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2208/5680 [6:03:09<7:36:39,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4507', 'grad_norm': '0.2486', 'learning_rate': '0.0001343', 'ppl': '1.569', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18087936, 'tokens/trainable': 17892876, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2208/5680 [6:03:09<7:36:39,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2209/5680 [6:03:17<7:36:43,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6503', 'grad_norm': '0.3382', 'learning_rate': '0.0001342', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 18096128, 'tokens/trainable': 17901020, 'epoch': '3.04'}
 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2209/5680 [6:03:17<7:36:43,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2210/5680 [6:03:25<7:37:11,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5144', 'grad_norm': '0.2885', 'learning_rate': '0.0001342', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 18104320, 'tokens/trainable': 17909174, 'epoch': '3.041'}
 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2210/5680 [6:03:25<7:37:11,  7.91s/it] 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2211/5680 [6:03:33<7:37:00,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5097', 'grad_norm': '0.2433', 'learning_rate': '0.0001341', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 18112512, 'tokens/trainable': 17917272, 'epoch': '3.041'}
 39%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                     | 2211/5680 [6:03:33<7:37:00,  7.90s/it] 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2212/5680 [6:03:40<7:36:44,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6034', 'grad_norm': '0.2805', 'learning_rate': '0.0001341', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 18120704, 'tokens/trainable': 17925400, 'epoch': '3.041'}
 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2212/5680 [6:03:40<7:36:44,  7.90s/it] 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2213/5680 [6:03:48<7:35:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6933', 'grad_norm': '0.3057', 'learning_rate': '0.000134', 'ppl': '2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18128896, 'tokens/trainable': 17933528, 'epoch': '3.041'}
 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2213/5680 [6:03:48<7:35:12,  7.88s/it] 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2214/5680 [6:03:56<7:35:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4943', 'grad_norm': '0.3136', 'learning_rate': '0.000134', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 18137088, 'tokens/trainable': 17941662, 'epoch': '3.041'}
 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2214/5680 [6:03:56<7:35:38,  7.89s/it] 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2215/5680 [6:04:04<7:34:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9193', 'grad_norm': '0.3398', 'learning_rate': '0.0001339', 'ppl': '2.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 18145280, 'tokens/trainable': 17949818, 'epoch': '3.042'}
 39%|██████████████████████████████████████████████████████████████████████████▊                                                                                                                     | 2215/5680 [6:04:04<7:34:17,  7.87s/it] 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2216/5680 [6:04:12<7:39:31,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4976', 'grad_norm': '0.2387', 'learning_rate': '0.0001339', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.8', 'tokens/total': 18153472, 'tokens/trainable': 17957974, 'epoch': '3.042'}
 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2216/5680 [6:04:12<7:39:31,  7.96s/it] 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2217/5680 [6:04:20<7:37:53,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6238', 'grad_norm': '0.39', 'learning_rate': '0.0001338', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18161664, 'tokens/trainable': 17966100, 'epoch': '3.042'}
 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2217/5680 [6:04:20<7:37:53,  7.93s/it] 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2218/5680 [6:04:28<7:36:38,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5564', 'grad_norm': '0.2706', 'learning_rate': '0.0001338', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18169856, 'tokens/trainable': 17974274, 'epoch': '3.042'}
 39%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 2218/5680 [6:04:28<7:36:38,  7.91s/it] 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2219/5680 [6:04:36<7:35:30,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7372', 'grad_norm': '0.302', 'learning_rate': '0.0001337', 'ppl': '2.09', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 18178048, 'tokens/trainable': 17982448, 'epoch': '3.042'}
 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2219/5680 [6:04:36<7:35:30,  7.90s/it] 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2220/5680 [6:04:44<7:34:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7164', 'grad_norm': '0.3467', 'learning_rate': '0.0001337', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18186240, 'tokens/trainable': 17990592, 'epoch': '3.042'}
 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2220/5680 [6:04:44<7:34:21,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2221/5680 [6:04:51<7:34:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7066', 'grad_norm': '0.3148', 'learning_rate': '0.0001336', 'ppl': '2.027', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18194432, 'tokens/trainable': 17998742, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2221/5680 [6:04:51<7:34:01,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2222/5680 [6:04:59<7:33:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7007', 'grad_norm': '0.3413', 'learning_rate': '0.0001336', 'ppl': '2.015', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18202624, 'tokens/trainable': 18006916, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████                                                                                                                     | 2222/5680 [6:04:59<7:33:50,  7.87s/it] 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2223/5680 [6:05:07<7:33:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6051', 'grad_norm': '0.4096', 'learning_rate': '0.0001335', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18210816, 'tokens/trainable': 18015106, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2223/5680 [6:05:07<7:33:52,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2224/5680 [6:05:15<7:34:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6586', 'grad_norm': '0.3001', 'learning_rate': '0.0001335', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18219008, 'tokens/trainable': 18023268, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2224/5680 [6:05:15<7:34:00,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2225/5680 [6:05:23<7:33:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6133', 'grad_norm': '0.3155', 'learning_rate': '0.0001334', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18227200, 'tokens/trainable': 18031456, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2225/5680 [6:05:23<7:33:56,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2226/5680 [6:05:31<7:34:04,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8298', 'grad_norm': '0.3003', 'learning_rate': '0.0001334', 'ppl': '2.293', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18235392, 'tokens/trainable': 18039632, 'epoch': '3.043'}
 39%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 2226/5680 [6:05:31<7:34:04,  7.89s/it] 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2227/5680 [6:05:39<7:38:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6699', 'grad_norm': '0.2944', 'learning_rate': '0.0001333', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 18243584, 'tokens/trainable': 18047792, 'epoch': '3.044'}
 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2227/5680 [6:05:39<7:38:17,  7.96s/it] 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2228/5680 [6:05:47<7:36:29,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6283', 'grad_norm': '0.3089', 'learning_rate': '0.0001333', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18251776, 'tokens/trainable': 18055948, 'epoch': '3.044'}
 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2228/5680 [6:05:47<7:36:29,  7.93s/it] 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2229/5680 [6:05:55<7:35:37,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5521', 'grad_norm': '0.2882', 'learning_rate': '0.0001332', 'ppl': '1.737', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 18259968, 'tokens/trainable': 18064080, 'epoch': '3.044'}
 39%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 2229/5680 [6:05:55<7:35:37,  7.92s/it] 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2230/5680 [6:06:03<7:34:44,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5449', 'grad_norm': '0.281', 'learning_rate': '0.0001332', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18268160, 'tokens/trainable': 18072270, 'epoch': '3.044'}
 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2230/5680 [6:06:03<7:34:44,  7.91s/it] 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2231/5680 [6:06:11<7:34:00,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3848', 'grad_norm': '0.2497', 'learning_rate': '0.0001331', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18276352, 'tokens/trainable': 18080412, 'epoch': '3.044'}
 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2231/5680 [6:06:11<7:34:00,  7.90s/it] 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2232/5680 [6:06:18<7:34:09,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6019', 'grad_norm': '0.3057', 'learning_rate': '0.0001331', 'ppl': '1.825', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18284544, 'tokens/trainable': 18088588, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2232/5680 [6:06:18<7:34:09,  7.90s/it] 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2233/5680 [6:06:26<7:33:36,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4594', 'grad_norm': '0.3611', 'learning_rate': '0.000133', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18292736, 'tokens/trainable': 18096764, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 2233/5680 [6:06:26<7:33:36,  7.90s/it] 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2234/5680 [6:06:34<7:33:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8093', 'grad_norm': '0.3022', 'learning_rate': '0.0001329', 'ppl': '2.246', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18300928, 'tokens/trainable': 18104904, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2234/5680 [6:06:34<7:33:12,  7.89s/it] 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2235/5680 [6:06:42<7:32:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.495', 'grad_norm': '0.2607', 'learning_rate': '0.0001329', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18309120, 'tokens/trainable': 18113032, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2235/5680 [6:06:42<7:32:29,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2236/5680 [6:06:50<7:31:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5896', 'grad_norm': '0.3763', 'learning_rate': '0.0001328', 'ppl': '1.803', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18317312, 'tokens/trainable': 18121152, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2236/5680 [6:06:50<7:31:48,  7.87s/it] 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2237/5680 [6:06:58<7:31:54,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6106', 'grad_norm': '0.3459', 'learning_rate': '0.0001328', 'ppl': '1.841', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 18325504, 'tokens/trainable': 18129246, 'epoch': '3.045'}
 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                    | 2237/5680 [6:06:58<7:31:54,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2238/5680 [6:07:06<7:31:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7883', 'grad_norm': '0.291', 'learning_rate': '0.0001327', 'ppl': '2.2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18333696, 'tokens/trainable': 18137408, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2238/5680 [6:07:06<7:31:28,  7.87s/it] 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2239/5680 [6:07:14<7:32:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6233', 'grad_norm': '0.2902', 'learning_rate': '0.0001327', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 18341888, 'tokens/trainable': 18145554, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2239/5680 [6:07:14<7:32:03,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2240/5680 [6:07:22<7:32:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7023', 'grad_norm': '0.3358', 'learning_rate': '0.0001326', 'ppl': '2.018', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 18350080, 'tokens/trainable': 18153688, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 2240/5680 [6:07:22<7:32:35,  7.89s/it] 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2241/5680 [6:07:29<7:31:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3875', 'grad_norm': '0.288', 'learning_rate': '0.0001326', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 18358272, 'tokens/trainable': 18161876, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2241/5680 [6:07:29<7:31:56,  7.88s/it] 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2242/5680 [6:07:37<7:31:59,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.9747', 'grad_norm': '0.3516', 'learning_rate': '0.0001325', 'ppl': '2.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18366464, 'tokens/trainable': 18170022, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2242/5680 [6:07:37<7:31:59,  7.89s/it] 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2243/5680 [6:07:45<7:31:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5102', 'grad_norm': '0.3158', 'learning_rate': '0.0001325', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18374656, 'tokens/trainable': 18178184, 'epoch': '3.046'}
 39%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2243/5680 [6:07:45<7:31:11,  7.88s/it] 40%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2244/5680 [6:07:53<7:31:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4141', 'grad_norm': '0.2824', 'learning_rate': '0.0001324', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18382848, 'tokens/trainable': 18186352, 'epoch': '3.047'}
 40%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 2244/5680 [6:07:53<7:31:12,  7.88s/it] 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2245/5680 [6:08:01<7:31:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3829', 'grad_norm': '0.2648', 'learning_rate': '0.0001324', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18391040, 'tokens/trainable': 18194520, 'epoch': '3.047'}
 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2245/5680 [6:08:01<7:31:03,  7.88s/it] 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2246/5680 [6:08:09<7:30:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3836', 'grad_norm': '0.2799', 'learning_rate': '0.0001323', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18399232, 'tokens/trainable': 18202672, 'epoch': '3.047'}
 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2246/5680 [6:08:09<7:30:50,  7.88s/it] 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2247/5680 [6:08:17<7:30:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5467', 'grad_norm': '0.2601', 'learning_rate': '0.0001323', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18407424, 'tokens/trainable': 18210776, 'epoch': '3.047'}
 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2247/5680 [6:08:17<7:30:21,  7.87s/it] 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2248/5680 [6:08:25<7:30:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.522', 'grad_norm': '0.3484', 'learning_rate': '0.0001322', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18415616, 'tokens/trainable': 18218904, 'epoch': '3.047'}
 40%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                    | 2248/5680 [6:08:25<7:30:08,  7.87s/it] 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2249/5680 [6:08:32<7:29:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6111', 'grad_norm': '0.3078', 'learning_rate': '0.0001322', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18423808, 'tokens/trainable': 18227020, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2249/5680 [6:08:32<7:29:39,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2250/5680 [6:08:40<7:30:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6842', 'grad_norm': '0.3222', 'learning_rate': '0.0001321', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18432000, 'tokens/trainable': 18235190, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2250/5680 [6:08:40<7:30:06,  7.87s/it] 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2251/5680 [6:08:48<7:29:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9669', 'grad_norm': '0.3377', 'learning_rate': '0.0001321', 'ppl': '2.63', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 18440192, 'tokens/trainable': 18243362, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2251/5680 [6:08:48<7:29:39,  7.87s/it] 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2252/5680 [6:08:56<7:29:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6577', 'grad_norm': '0.3497', 'learning_rate': '0.000132', 'ppl': '1.93', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 18448384, 'tokens/trainable': 18251452, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████                                                                                                                    | 2252/5680 [6:08:56<7:29:16,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2253/5680 [6:09:04<7:29:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7348', 'grad_norm': '0.3523', 'learning_rate': '0.000132', 'ppl': '2.085', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18456576, 'tokens/trainable': 18259634, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2253/5680 [6:09:04<7:29:11,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2254/5680 [6:09:12<7:29:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5322', 'grad_norm': '0.3115', 'learning_rate': '0.0001319', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 18464768, 'tokens/trainable': 18267760, 'epoch': '3.048'}
 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2254/5680 [6:09:12<7:29:40,  7.88s/it] 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2255/5680 [6:09:20<7:29:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.44', 'grad_norm': '0.2534', 'learning_rate': '0.0001318', 'ppl': '1.553', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 18472960, 'tokens/trainable': 18275902, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 2255/5680 [6:09:20<7:29:53,  7.88s/it] 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2256/5680 [6:09:27<7:29:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7473', 'grad_norm': '0.3509', 'learning_rate': '0.0001318', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18481152, 'tokens/trainable': 18284050, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2256/5680 [6:09:27<7:29:35,  7.88s/it] 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2257/5680 [6:09:35<7:28:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5693', 'grad_norm': '0.3101', 'learning_rate': '0.0001317', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 18489344, 'tokens/trainable': 18292222, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2257/5680 [6:09:35<7:28:44,  7.87s/it] 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2258/5680 [6:09:43<7:28:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8668', 'grad_norm': '0.3904', 'learning_rate': '0.0001317', 'ppl': '2.379', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 18497536, 'tokens/trainable': 18300394, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2258/5680 [6:09:43<7:28:18,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2259/5680 [6:09:51<7:27:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6876', 'grad_norm': '0.2759', 'learning_rate': '0.0001316', 'ppl': '1.989', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 18505728, 'tokens/trainable': 18308560, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 2259/5680 [6:09:51<7:27:37,  7.85s/it] 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2260/5680 [6:09:59<7:27:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8363', 'grad_norm': '0.35', 'learning_rate': '0.0001316', 'ppl': '2.308', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18513920, 'tokens/trainable': 18316744, 'epoch': '3.049'}
 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2260/5680 [6:09:59<7:27:53,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2261/5680 [6:10:07<7:27:24,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4108', 'grad_norm': '0.2571', 'learning_rate': '0.0001315', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 18522112, 'tokens/trainable': 18324920, 'epoch': '3.05'}
 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2261/5680 [6:10:07<7:27:24,  7.85s/it] 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2262/5680 [6:10:15<7:26:53,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6727', 'grad_norm': '0.3394', 'learning_rate': '0.0001315', 'ppl': '1.96', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18530304, 'tokens/trainable': 18333032, 'epoch': '3.05'}
 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2262/5680 [6:10:15<7:26:53,  7.84s/it] 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2263/5680 [6:10:22<7:26:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3307', 'grad_norm': '0.2414', 'learning_rate': '0.0001314', 'ppl': '1.392', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18538496, 'tokens/trainable': 18341196, 'epoch': '3.05'}
 40%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                   | 2263/5680 [6:10:22<7:26:57,  7.85s/it] 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2264/5680 [6:10:30<7:27:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9996', 'grad_norm': '0.302', 'learning_rate': '0.0001314', 'ppl': '2.717', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18546688, 'tokens/trainable': 18349364, 'epoch': '3.05'}
 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2264/5680 [6:10:30<7:27:13,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2265/5680 [6:10:38<7:26:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6724', 'grad_norm': '0.2902', 'learning_rate': '0.0001313', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18554880, 'tokens/trainable': 18357484, 'epoch': '3.05'}
 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2265/5680 [6:10:38<7:26:44,  7.85s/it] 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2266/5680 [6:10:46<7:27:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5424', 'grad_norm': '0.3268', 'learning_rate': '0.0001313', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18563072, 'tokens/trainable': 18365622, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 2266/5680 [6:10:46<7:27:08,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2267/5680 [6:10:54<7:26:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7762', 'grad_norm': '0.3333', 'learning_rate': '0.0001312', 'ppl': '2.173', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18571264, 'tokens/trainable': 18373768, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2267/5680 [6:10:54<7:26:53,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2268/5680 [6:11:02<7:26:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7871', 'grad_norm': '0.321', 'learning_rate': '0.0001312', 'ppl': '2.197', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 18579456, 'tokens/trainable': 18381930, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2268/5680 [6:11:02<7:26:26,  7.85s/it] 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2269/5680 [6:11:09<7:25:49,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7837', 'grad_norm': '0.3449', 'learning_rate': '0.0001311', 'ppl': '2.19', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18587648, 'tokens/trainable': 18390036, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2269/5680 [6:11:09<7:25:49,  7.84s/it] 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2270/5680 [6:11:18<7:31:21,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8484', 'grad_norm': '0.461', 'learning_rate': '0.0001311', 'ppl': '2.336', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 18595840, 'tokens/trainable': 18398200, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 2270/5680 [6:11:18<7:31:21,  7.94s/it] 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2271/5680 [6:11:26<7:30:51,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3619', 'grad_norm': '0.2644', 'learning_rate': '0.000131', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 18604032, 'tokens/trainable': 18406288, 'epoch': '3.051'}
 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2271/5680 [6:11:26<7:30:51,  7.94s/it] 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2272/5680 [6:11:33<7:29:39,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5654', 'grad_norm': '0.2736', 'learning_rate': '0.000131', 'ppl': '1.76', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18612224, 'tokens/trainable': 18414444, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2272/5680 [6:11:33<7:29:39,  7.92s/it] 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2273/5680 [6:11:41<7:28:37,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.636', 'grad_norm': '0.3658', 'learning_rate': '0.0001309', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18620416, 'tokens/trainable': 18422608, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2273/5680 [6:11:41<7:28:37,  7.90s/it] 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2274/5680 [6:11:49<7:27:28,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.987', 'grad_norm': '0.3336', 'learning_rate': '0.0001308', 'ppl': '2.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 18628608, 'tokens/trainable': 18430776, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                   | 2274/5680 [6:11:49<7:27:28,  7.88s/it] 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2275/5680 [6:11:57<7:26:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4766', 'grad_norm': '0.3062', 'learning_rate': '0.0001308', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 18636800, 'tokens/trainable': 18438948, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2275/5680 [6:11:57<7:26:37,  7.87s/it] 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2276/5680 [6:12:05<7:26:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3863', 'grad_norm': '0.2524', 'learning_rate': '0.0001307', 'ppl': '1.472', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 18644992, 'tokens/trainable': 18447112, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2276/5680 [6:12:05<7:26:01,  7.86s/it] 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2277/5680 [6:12:13<7:26:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6885', 'grad_norm': '0.3244', 'learning_rate': '0.0001307', 'ppl': '1.991', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18653184, 'tokens/trainable': 18455294, 'epoch': '3.052'}
 40%|████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 2277/5680 [6:12:13<7:26:07,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2278/5680 [6:12:21<7:26:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4087', 'grad_norm': '0.2985', 'learning_rate': '0.0001306', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18661376, 'tokens/trainable': 18463456, 'epoch': '3.053'}
 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2278/5680 [6:12:21<7:26:21,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2279/5680 [6:12:28<7:26:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8378', 'grad_norm': '0.3472', 'learning_rate': '0.0001306', 'ppl': '2.311', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18669568, 'tokens/trainable': 18471624, 'epoch': '3.053'}
 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2279/5680 [6:12:28<7:26:06,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2280/5680 [6:12:37<7:30:50,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.548', 'grad_norm': '0.3043', 'learning_rate': '0.0001305', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.5', 'tokens/total': 18677760, 'tokens/trainable': 18479772, 'epoch': '3.053'}
 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2280/5680 [6:12:37<7:30:50,  7.96s/it] 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2281/5680 [6:12:45<7:29:35,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.56', 'grad_norm': '0.2884', 'learning_rate': '0.0001305', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18685952, 'tokens/trainable': 18487956, 'epoch': '3.053'}
 40%|█████████████████████████████████████████████████████████████████████████████                                                                                                                   | 2281/5680 [6:12:45<7:29:35,  7.94s/it] 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2282/5680 [6:12:52<7:28:33,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5137', 'grad_norm': '0.3883', 'learning_rate': '0.0001304', 'ppl': '1.671', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 18694144, 'tokens/trainable': 18496076, 'epoch': '3.053'}
 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2282/5680 [6:12:52<7:28:33,  7.92s/it] 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2283/5680 [6:13:00<7:27:09,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7721', 'grad_norm': '0.3607', 'learning_rate': '0.0001304', 'ppl': '2.164', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18702336, 'tokens/trainable': 18504188, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2283/5680 [6:13:00<7:27:09,  7.90s/it] 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2284/5680 [6:13:08<7:26:25,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6629', 'grad_norm': '0.2784', 'learning_rate': '0.0001303', 'ppl': '1.94', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18710528, 'tokens/trainable': 18512312, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2284/5680 [6:13:08<7:26:25,  7.89s/it] 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2285/5680 [6:13:16<7:26:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5657', 'grad_norm': '0.3855', 'learning_rate': '0.0001303', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 18718720, 'tokens/trainable': 18520388, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 2285/5680 [6:13:16<7:26:12,  7.89s/it] 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2286/5680 [6:13:24<7:26:15,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5111', 'grad_norm': '0.2924', 'learning_rate': '0.0001302', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18726912, 'tokens/trainable': 18528562, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2286/5680 [6:13:24<7:26:15,  7.89s/it] 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2287/5680 [6:13:32<7:25:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.9789', 'grad_norm': '0.329', 'learning_rate': '0.0001302', 'ppl': '2.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 18735104, 'tokens/trainable': 18536676, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2287/5680 [6:13:32<7:25:29,  7.88s/it] 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2288/5680 [6:13:40<7:24:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6348', 'grad_norm': '0.3163', 'learning_rate': '0.0001301', 'ppl': '1.887', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18743296, 'tokens/trainable': 18544808, 'epoch': '3.054'}
 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2288/5680 [6:13:40<7:24:43,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2289/5680 [6:13:47<7:24:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6924', 'grad_norm': '0.3416', 'learning_rate': '0.0001301', 'ppl': '1.999', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18751488, 'tokens/trainable': 18552932, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 2289/5680 [6:13:47<7:24:09,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2290/5680 [6:13:55<7:23:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8515', 'grad_norm': '0.3587', 'learning_rate': '0.00013', 'ppl': '2.343', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 18759680, 'tokens/trainable': 18561100, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2290/5680 [6:13:55<7:23:29,  7.85s/it] 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2291/5680 [6:14:03<7:24:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4554', 'grad_norm': '0.2763', 'learning_rate': '0.00013', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18767872, 'tokens/trainable': 18569268, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2291/5680 [6:14:03<7:24:11,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2292/5680 [6:14:11<7:23:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4223', 'grad_norm': '0.267', 'learning_rate': '0.0001299', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18776064, 'tokens/trainable': 18577384, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 2292/5680 [6:14:11<7:23:44,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2293/5680 [6:14:19<7:23:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6028', 'grad_norm': '0.2945', 'learning_rate': '0.0001298', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18784256, 'tokens/trainable': 18585554, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2293/5680 [6:14:19<7:23:33,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2294/5680 [6:14:27<7:23:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6642', 'grad_norm': '0.3241', 'learning_rate': '0.0001298', 'ppl': '1.943', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18792448, 'tokens/trainable': 18593728, 'epoch': '3.055'}
 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2294/5680 [6:14:27<7:23:51,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2295/5680 [6:14:35<7:23:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8161', 'grad_norm': '0.3241', 'learning_rate': '0.0001297', 'ppl': '2.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18800640, 'tokens/trainable': 18601896, 'epoch': '3.056'}
 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2295/5680 [6:14:35<7:23:47,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2296/5680 [6:14:42<7:23:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6909', 'grad_norm': '0.327', 'learning_rate': '0.0001297', 'ppl': '1.996', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 18808832, 'tokens/trainable': 18610052, 'epoch': '3.056'}
 40%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 2296/5680 [6:14:42<7:23:27,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2297/5680 [6:14:50<7:23:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4529', 'grad_norm': '0.2404', 'learning_rate': '0.0001296', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18817024, 'tokens/trainable': 18618212, 'epoch': '3.056'}
 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2297/5680 [6:14:50<7:23:24,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2298/5680 [6:14:58<7:23:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5749', 'grad_norm': '0.2892', 'learning_rate': '0.0001296', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 18825216, 'tokens/trainable': 18626364, 'epoch': '3.056'}
 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2298/5680 [6:14:58<7:23:21,  7.87s/it] 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2299/5680 [6:15:06<7:22:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4959', 'grad_norm': '0.3209', 'learning_rate': '0.0001295', 'ppl': '1.642', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18833408, 'tokens/trainable': 18634460, 'epoch': '3.056'}
 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2299/5680 [6:15:06<7:22:58,  7.86s/it] 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2300/5680 [6:15:14<7:22:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7403', 'grad_norm': '0.343', 'learning_rate': '0.0001295', 'ppl': '2.096', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18841600, 'tokens/trainable': 18642582, 'epoch': '3.057'}
 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                  | 2300/5680 [6:15:14<7:22:19,  7.85s/it] 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2301/5680 [6:15:22<7:22:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6288', 'grad_norm': '0.3389', 'learning_rate': '0.0001294', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18849792, 'tokens/trainable': 18650752, 'epoch': '3.057'}
 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2301/5680 [6:15:22<7:22:31,  7.86s/it] 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2302/5680 [6:15:30<7:23:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.422', 'grad_norm': '0.2829', 'learning_rate': '0.0001294', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 18857984, 'tokens/trainable': 18658884, 'epoch': '3.057'}
 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2302/5680 [6:15:30<7:23:25,  7.88s/it] 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2303/5680 [6:15:38<7:23:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4715', 'grad_norm': '0.2797', 'learning_rate': '0.0001293', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18866176, 'tokens/trainable': 18667028, 'epoch': '3.057'}
 41%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 2303/5680 [6:15:38<7:23:25,  7.88s/it] 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2304/5680 [6:15:45<7:23:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.483', 'grad_norm': '0.2907', 'learning_rate': '0.0001293', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18874368, 'tokens/trainable': 18675176, 'epoch': '3.057'}
 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2304/5680 [6:15:45<7:23:17,  7.88s/it] 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2305/5680 [6:15:53<7:22:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5441', 'grad_norm': '0.2991', 'learning_rate': '0.0001292', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18882560, 'tokens/trainable': 18683320, 'epoch': '3.057'}
 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2305/5680 [6:15:53<7:22:59,  7.88s/it] 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2306/5680 [6:16:01<7:22:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4092', 'grad_norm': '0.2719', 'learning_rate': '0.0001292', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 18890752, 'tokens/trainable': 18691482, 'epoch': '3.058'}
 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2306/5680 [6:16:01<7:22:52,  7.88s/it] 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2307/5680 [6:16:09<7:22:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7587', 'grad_norm': '0.3408', 'learning_rate': '0.0001291', 'ppl': '2.135', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18898944, 'tokens/trainable': 18699624, 'epoch': '3.058'}
 41%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2307/5680 [6:16:09<7:22:34,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2308/5680 [6:16:17<7:22:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6992', 'grad_norm': '0.2836', 'learning_rate': '0.0001291', 'ppl': '2.012', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18907136, 'tokens/trainable': 18707784, 'epoch': '3.058'}
 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2308/5680 [6:16:17<7:22:04,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2309/5680 [6:16:25<7:22:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5478', 'grad_norm': '0.3029', 'learning_rate': '0.000129', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 18915328, 'tokens/trainable': 18715972, 'epoch': '3.058'}
 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2309/5680 [6:16:25<7:22:07,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2310/5680 [6:16:33<7:22:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5154', 'grad_norm': '0.2529', 'learning_rate': '0.0001289', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18923520, 'tokens/trainable': 18724156, 'epoch': '3.058'}
 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2310/5680 [6:16:33<7:22:20,  7.88s/it] 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2311/5680 [6:16:41<7:22:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6764', 'grad_norm': '0.379', 'learning_rate': '0.0001289', 'ppl': '1.967', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 18931712, 'tokens/trainable': 18732340, 'epoch': '3.058'}
 41%|██████████████████████████████████████████████████████████████████████████████                                                                                                                  | 2311/5680 [6:16:41<7:22:20,  7.88s/it] 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2312/5680 [6:16:48<7:22:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5657', 'grad_norm': '0.2713', 'learning_rate': '0.0001288', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18939904, 'tokens/trainable': 18740468, 'epoch': '3.059'}
 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2312/5680 [6:16:48<7:22:14,  7.88s/it] 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2313/5680 [6:16:57<7:27:21,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5427', 'grad_norm': '0.4099', 'learning_rate': '0.0001288', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.8', 'tokens/total': 18948096, 'tokens/trainable': 18748640, 'epoch': '3.059'}
 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2313/5680 [6:16:57<7:27:21,  7.97s/it] 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2314/5680 [6:17:04<7:25:14,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4047', 'grad_norm': '0.2881', 'learning_rate': '0.0001287', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18956288, 'tokens/trainable': 18756742, 'epoch': '3.059'}
 41%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 2314/5680 [6:17:04<7:25:14,  7.94s/it] 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2315/5680 [6:17:12<7:23:11,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7643', 'grad_norm': '0.341', 'learning_rate': '0.0001287', 'ppl': '2.147', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 18964480, 'tokens/trainable': 18764896, 'epoch': '3.059'}
 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2315/5680 [6:17:12<7:23:11,  7.90s/it] 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2316/5680 [6:17:20<7:22:50,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6287', 'grad_norm': '0.3464', 'learning_rate': '0.0001286', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 18972672, 'tokens/trainable': 18773006, 'epoch': '3.059'}
 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2316/5680 [6:17:20<7:22:50,  7.90s/it] 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2317/5680 [6:17:28<7:21:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7917', 'grad_norm': '0.3219', 'learning_rate': '0.0001286', 'ppl': '2.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 18980864, 'tokens/trainable': 18781064, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2317/5680 [6:17:28<7:21:17,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2318/5680 [6:17:36<7:21:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7158', 'grad_norm': '0.3656', 'learning_rate': '0.0001285', 'ppl': '2.046', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 18989056, 'tokens/trainable': 18789210, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2318/5680 [6:17:36<7:21:07,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2319/5680 [6:17:44<7:20:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2664', 'grad_norm': '0.2073', 'learning_rate': '0.0001285', 'ppl': '1.305', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 18997248, 'tokens/trainable': 18797336, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2319/5680 [6:17:44<7:20:59,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2320/5680 [6:17:52<7:20:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.317', 'grad_norm': '0.2224', 'learning_rate': '0.0001284', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19005440, 'tokens/trainable': 18805492, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2320/5680 [6:17:52<7:20:35,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2321/5680 [6:17:59<7:19:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7507', 'grad_norm': '0.3144', 'learning_rate': '0.0001284', 'ppl': '2.118', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 19013632, 'tokens/trainable': 18813656, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2321/5680 [6:17:59<7:19:59,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2322/5680 [6:18:07<7:19:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5736', 'grad_norm': '0.2707', 'learning_rate': '0.0001283', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19021824, 'tokens/trainable': 18821812, 'epoch': '3.06'}
 41%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2322/5680 [6:18:07<7:19:38,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2323/5680 [6:18:15<7:18:41,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.2788', 'learning_rate': '0.0001283', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1048', 'tokens/total': 19030016, 'tokens/trainable': 18829992, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2323/5680 [6:18:15<7:18:41,  7.84s/it] 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2324/5680 [6:18:23<7:18:39,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7541', 'grad_norm': '0.2977', 'learning_rate': '0.0001282', 'ppl': '2.126', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 19038208, 'tokens/trainable': 18838108, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2324/5680 [6:18:23<7:18:39,  7.84s/it] 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2325/5680 [6:18:31<7:18:49,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7245', 'grad_norm': '0.2952', 'learning_rate': '0.0001282', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19046400, 'tokens/trainable': 18846276, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                 | 2325/5680 [6:18:31<7:18:49,  7.85s/it] 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2326/5680 [6:18:39<7:19:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4996', 'grad_norm': '0.3337', 'learning_rate': '0.0001281', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 19054592, 'tokens/trainable': 18854346, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2326/5680 [6:18:39<7:19:37,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2327/5680 [6:18:47<7:19:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6501', 'grad_norm': '0.3115', 'learning_rate': '0.000128', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19062784, 'tokens/trainable': 18862468, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2327/5680 [6:18:47<7:19:00,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2328/5680 [6:18:54<7:19:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7365', 'grad_norm': '0.3264', 'learning_rate': '0.000128', 'ppl': '2.089', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 19070976, 'tokens/trainable': 18870568, 'epoch': '3.061'}
 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2328/5680 [6:18:54<7:19:12,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2329/5680 [6:19:02<7:19:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4365', 'grad_norm': '0.2982', 'learning_rate': '0.0001279', 'ppl': '1.547', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19079168, 'tokens/trainable': 18878678, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▋                                                                                                                 | 2329/5680 [6:19:02<7:19:03,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2330/5680 [6:19:10<7:19:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.684', 'grad_norm': '0.3658', 'learning_rate': '0.0001279', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19087360, 'tokens/trainable': 18886824, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2330/5680 [6:19:10<7:19:15,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2331/5680 [6:19:18<7:19:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5521', 'grad_norm': '0.2878', 'learning_rate': '0.0001278', 'ppl': '1.737', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19095552, 'tokens/trainable': 18894976, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2331/5680 [6:19:18<7:19:13,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2332/5680 [6:19:26<7:18:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5445', 'grad_norm': '0.3004', 'learning_rate': '0.0001278', 'ppl': '1.724', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19103744, 'tokens/trainable': 18903104, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2332/5680 [6:19:26<7:18:39,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2333/5680 [6:19:34<7:18:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5003', 'grad_norm': '0.2893', 'learning_rate': '0.0001277', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19111936, 'tokens/trainable': 18911254, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2333/5680 [6:19:34<7:18:13,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2334/5680 [6:19:42<7:18:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.27', 'grad_norm': '0.2416', 'learning_rate': '0.0001277', 'ppl': '1.31', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19120128, 'tokens/trainable': 18919414, 'epoch': '3.062'}
 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2334/5680 [6:19:42<7:18:26,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2335/5680 [6:19:49<7:18:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.481', 'grad_norm': '0.2621', 'learning_rate': '0.0001276', 'ppl': '1.618', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19128320, 'tokens/trainable': 18927600, 'epoch': '3.063'}
 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2335/5680 [6:19:49<7:18:44,  7.87s/it] 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2336/5680 [6:19:57<7:18:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7163', 'grad_norm': '0.3594', 'learning_rate': '0.0001276', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19136512, 'tokens/trainable': 18935736, 'epoch': '3.063'}
 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2336/5680 [6:19:57<7:18:10,  7.86s/it] 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2337/5680 [6:20:05<7:18:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5838', 'grad_norm': '0.3397', 'learning_rate': '0.0001275', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19144704, 'tokens/trainable': 18943882, 'epoch': '3.063'}
 41%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 2337/5680 [6:20:05<7:18:22,  7.87s/it] 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2338/5680 [6:20:13<7:18:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6701', 'grad_norm': '0.3098', 'learning_rate': '0.0001275', 'ppl': '1.954', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19152896, 'tokens/trainable': 18952032, 'epoch': '3.063'}
 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2338/5680 [6:20:13<7:18:02,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2339/5680 [6:20:21<7:17:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5563', 'grad_norm': '0.3143', 'learning_rate': '0.0001274', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19161088, 'tokens/trainable': 18960196, 'epoch': '3.063'}
 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2339/5680 [6:20:21<7:17:56,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2340/5680 [6:20:29<7:17:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6141', 'grad_norm': '0.3237', 'learning_rate': '0.0001274', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 19169280, 'tokens/trainable': 18968380, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████                                                                                                                 | 2340/5680 [6:20:29<7:17:24,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2341/5680 [6:20:37<7:17:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.937', 'grad_norm': '0.3457', 'learning_rate': '0.0001273', 'ppl': '2.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19177472, 'tokens/trainable': 18976536, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2341/5680 [6:20:37<7:17:33,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2342/5680 [6:20:45<7:17:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4668', 'grad_norm': '0.2698', 'learning_rate': '0.0001273', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19185664, 'tokens/trainable': 18984714, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2342/5680 [6:20:45<7:17:36,  7.87s/it] 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2343/5680 [6:20:52<7:17:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4532', 'grad_norm': '0.2983', 'learning_rate': '0.0001272', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19193856, 'tokens/trainable': 18992892, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2343/5680 [6:20:52<7:17:40,  7.87s/it] 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2344/5680 [6:21:00<7:17:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4633', 'grad_norm': '0.2847', 'learning_rate': '0.0001271', 'ppl': '1.589', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 19202048, 'tokens/trainable': 19000980, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 2344/5680 [6:21:00<7:17:15,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2345/5680 [6:21:08<7:16:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4525', 'grad_norm': '0.2969', 'learning_rate': '0.0001271', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19210240, 'tokens/trainable': 19009100, 'epoch': '3.064'}
 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2345/5680 [6:21:08<7:16:32,  7.85s/it] 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2346/5680 [6:21:16<7:16:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8379', 'grad_norm': '0.3144', 'learning_rate': '0.000127', 'ppl': '2.312', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 19218432, 'tokens/trainable': 19017224, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2346/5680 [6:21:16<7:16:20,  7.85s/it] 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2347/5680 [6:21:24<7:16:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7736', 'grad_norm': '0.3723', 'learning_rate': '0.000127', 'ppl': '2.168', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 19226624, 'tokens/trainable': 19025416, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2347/5680 [6:21:24<7:16:15,  7.85s/it] 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2348/5680 [6:21:32<7:16:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4004', 'grad_norm': '0.2539', 'learning_rate': '0.0001269', 'ppl': '1.492', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19234816, 'tokens/trainable': 19033584, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 2348/5680 [6:21:32<7:16:29,  7.86s/it] 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2349/5680 [6:21:40<7:16:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4647', 'grad_norm': '0.2743', 'learning_rate': '0.0001269', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19243008, 'tokens/trainable': 19041760, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2349/5680 [6:21:40<7:16:54,  7.87s/it] 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2350/5680 [6:21:47<7:17:37,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7208', 'grad_norm': '0.3601', 'learning_rate': '0.0001268', 'ppl': '2.056', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19251200, 'tokens/trainable': 19049948, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2350/5680 [6:21:47<7:17:37,  7.89s/it] 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2351/5680 [6:21:56<7:21:47,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4946', 'grad_norm': '0.3019', 'learning_rate': '0.0001268', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.2', 'tokens/total': 19259392, 'tokens/trainable': 19058042, 'epoch': '3.065'}
 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                | 2351/5680 [6:21:56<7:21:47,  7.96s/it] 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2352/5680 [6:22:03<7:20:12,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5792', 'grad_norm': '0.3678', 'learning_rate': '0.0001267', 'ppl': '1.785', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19267584, 'tokens/trainable': 19066230, 'epoch': '3.066'}
 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2352/5680 [6:22:03<7:20:12,  7.94s/it] 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2353/5680 [6:22:11<7:18:45,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6873', 'grad_norm': '0.314', 'learning_rate': '0.0001267', 'ppl': '1.988', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19275776, 'tokens/trainable': 19074364, 'epoch': '3.066'}
 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2353/5680 [6:22:11<7:18:45,  7.91s/it] 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2354/5680 [6:22:19<7:17:08,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.541', 'grad_norm': '0.2863', 'learning_rate': '0.0001266', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19283968, 'tokens/trainable': 19082476, 'epoch': '3.066'}
 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2354/5680 [6:22:19<7:17:08,  7.89s/it] 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2355/5680 [6:22:27<7:16:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5557', 'grad_norm': '0.3176', 'learning_rate': '0.0001266', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 19292160, 'tokens/trainable': 19090616, 'epoch': '3.066'}
 41%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 2355/5680 [6:22:27<7:16:57,  7.88s/it] 41%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2356/5680 [6:22:35<7:21:23,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5208', 'grad_norm': '0.2534', 'learning_rate': '0.0001265', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 19300352, 'tokens/trainable': 19098776, 'epoch': '3.066'}
 41%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2356/5680 [6:22:35<7:21:23,  7.97s/it] 41%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2357/5680 [6:22:43<7:19:07,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6001', 'grad_norm': '0.3582', 'learning_rate': '0.0001265', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 19308544, 'tokens/trainable': 19106962, 'epoch': '3.067'}
 41%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2357/5680 [6:22:43<7:19:07,  7.93s/it] 42%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2358/5680 [6:22:51<7:17:39,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7977', 'grad_norm': '0.3226', 'learning_rate': '0.0001264', 'ppl': '2.22', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 19316736, 'tokens/trainable': 19115138, 'epoch': '3.067'}
 42%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2358/5680 [6:22:51<7:17:39,  7.90s/it] 42%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2359/5680 [6:22:59<7:16:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4203', 'grad_norm': '0.2657', 'learning_rate': '0.0001263', 'ppl': '1.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19324928, 'tokens/trainable': 19123274, 'epoch': '3.067'}
 42%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 2359/5680 [6:22:59<7:16:25,  7.88s/it] 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2360/5680 [6:23:07<7:15:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.573', 'grad_norm': '0.3242', 'learning_rate': '0.0001263', 'ppl': '1.774', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19333120, 'tokens/trainable': 19131440, 'epoch': '3.067'}
 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2360/5680 [6:23:07<7:15:49,  7.88s/it] 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2361/5680 [6:23:14<7:15:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7923', 'grad_norm': '0.3451', 'learning_rate': '0.0001262', 'ppl': '2.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19341312, 'tokens/trainable': 19139608, 'epoch': '3.067'}
 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2361/5680 [6:23:14<7:15:40,  7.88s/it] 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2362/5680 [6:23:22<7:15:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6733', 'grad_norm': '0.3125', 'learning_rate': '0.0001262', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 19349504, 'tokens/trainable': 19147680, 'epoch': '3.067'}
 42%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                | 2362/5680 [6:23:22<7:15:10,  7.87s/it] 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2363/5680 [6:23:30<7:14:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5447', 'grad_norm': '0.3701', 'learning_rate': '0.0001261', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19357696, 'tokens/trainable': 19155832, 'epoch': '3.068'}
 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2363/5680 [6:23:30<7:14:37,  7.86s/it] 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2364/5680 [6:23:38<7:14:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6354', 'grad_norm': '0.3184', 'learning_rate': '0.0001261', 'ppl': '1.888', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 19365888, 'tokens/trainable': 19164000, 'epoch': '3.068'}
 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2364/5680 [6:23:38<7:14:04,  7.85s/it] 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2365/5680 [6:23:46<7:14:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4107', 'grad_norm': '0.2903', 'learning_rate': '0.000126', 'ppl': '1.508', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19374080, 'tokens/trainable': 19172132, 'epoch': '3.068'}
 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2365/5680 [6:23:46<7:14:22,  7.86s/it] 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2366/5680 [6:23:54<7:13:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6119', 'grad_norm': '0.3047', 'learning_rate': '0.000126', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19382272, 'tokens/trainable': 19180298, 'epoch': '3.068'}
 42%|███████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 2366/5680 [6:23:54<7:13:58,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2367/5680 [6:24:02<7:13:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4801', 'grad_norm': '0.2664', 'learning_rate': '0.0001259', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19390464, 'tokens/trainable': 19188404, 'epoch': '3.068'}
 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2367/5680 [6:24:02<7:13:38,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2368/5680 [6:24:09<7:13:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4322', 'grad_norm': '0.2775', 'learning_rate': '0.0001259', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19398656, 'tokens/trainable': 19196516, 'epoch': '3.068'}
 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2368/5680 [6:24:09<7:13:26,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2369/5680 [6:24:17<7:13:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5658', 'grad_norm': '0.2673', 'learning_rate': '0.0001258', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19406848, 'tokens/trainable': 19204668, 'epoch': '3.069'}
 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2369/5680 [6:24:17<7:13:31,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2370/5680 [6:24:25<7:13:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3898', 'grad_norm': '0.2632', 'learning_rate': '0.0001258', 'ppl': '1.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19415040, 'tokens/trainable': 19212804, 'epoch': '3.069'}
 42%|████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2370/5680 [6:24:25<7:13:06,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2371/5680 [6:24:33<7:12:35,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.9412', 'grad_norm': '0.3206', 'learning_rate': '0.0001257', 'ppl': '2.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19423232, 'tokens/trainable': 19220896, 'epoch': '3.069'}
 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2371/5680 [6:24:33<7:12:35,  7.84s/it] 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2372/5680 [6:24:41<7:12:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.556', 'grad_norm': '0.3155', 'learning_rate': '0.0001257', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 19431424, 'tokens/trainable': 19229076, 'epoch': '3.069'}
 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2372/5680 [6:24:41<7:12:38,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2373/5680 [6:24:49<7:12:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5986', 'grad_norm': '0.3426', 'learning_rate': '0.0001256', 'ppl': '1.82', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19439616, 'tokens/trainable': 19237240, 'epoch': '3.069'}
 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2373/5680 [6:24:49<7:12:51,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2374/5680 [6:24:57<7:13:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5355', 'grad_norm': '0.2698', 'learning_rate': '0.0001255', 'ppl': '1.708', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19447808, 'tokens/trainable': 19245410, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▏                                                                                                               | 2374/5680 [6:24:57<7:13:06,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2375/5680 [6:25:04<7:12:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7737', 'grad_norm': '0.3679', 'learning_rate': '0.0001255', 'ppl': '2.168', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 19456000, 'tokens/trainable': 19253492, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2375/5680 [6:25:04<7:12:47,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2376/5680 [6:25:12<7:13:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4724', 'grad_norm': '0.3099', 'learning_rate': '0.0001254', 'ppl': '1.604', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 19464192, 'tokens/trainable': 19261652, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2376/5680 [6:25:12<7:13:06,  7.87s/it] 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2377/5680 [6:25:20<7:13:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8455', 'grad_norm': '0.3511', 'learning_rate': '0.0001254', 'ppl': '2.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 19472384, 'tokens/trainable': 19269780, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2377/5680 [6:25:20<7:13:11,  7.87s/it] 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2378/5680 [6:25:28<7:13:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5642', 'grad_norm': '0.3471', 'learning_rate': '0.0001253', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 19480576, 'tokens/trainable': 19277828, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2378/5680 [6:25:28<7:13:25,  7.88s/it] 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2379/5680 [6:25:36<7:12:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4182', 'grad_norm': '0.2646', 'learning_rate': '0.0001253', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19488768, 'tokens/trainable': 19285960, 'epoch': '3.07'}
 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2379/5680 [6:25:36<7:12:39,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2380/5680 [6:25:44<7:12:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7595', 'grad_norm': '0.3285', 'learning_rate': '0.0001252', 'ppl': '2.137', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19496960, 'tokens/trainable': 19294120, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2380/5680 [6:25:44<7:12:08,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2381/5680 [6:25:52<7:11:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8027', 'grad_norm': '0.3076', 'learning_rate': '0.0001252', 'ppl': '2.231', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19505152, 'tokens/trainable': 19302290, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2381/5680 [6:25:52<7:11:59,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2382/5680 [6:25:59<7:12:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4686', 'grad_norm': '0.2971', 'learning_rate': '0.0001251', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 19513344, 'tokens/trainable': 19310452, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2382/5680 [6:25:59<7:12:40,  7.87s/it] 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2383/5680 [6:26:07<7:11:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7333', 'grad_norm': '0.3388', 'learning_rate': '0.0001251', 'ppl': '2.082', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 19521536, 'tokens/trainable': 19318614, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2383/5680 [6:26:07<7:11:51,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2384/5680 [6:26:15<7:11:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5639', 'grad_norm': '0.2688', 'learning_rate': '0.000125', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19529728, 'tokens/trainable': 19326792, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2384/5680 [6:26:15<7:11:55,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2385/5680 [6:26:23<7:11:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6494', 'grad_norm': '0.358', 'learning_rate': '0.000125', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19537920, 'tokens/trainable': 19334908, 'epoch': '3.071'}
 42%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 2385/5680 [6:26:23<7:11:19,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2386/5680 [6:26:31<7:11:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4458', 'grad_norm': '0.2867', 'learning_rate': '0.0001249', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19546112, 'tokens/trainable': 19343012, 'epoch': '3.072'}
 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2386/5680 [6:26:31<7:11:00,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2387/5680 [6:26:39<7:09:59,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.6711', 'grad_norm': '0.2855', 'learning_rate': '0.0001248', 'ppl': '1.956', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19554304, 'tokens/trainable': 19351124, 'epoch': '3.072'}
 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2387/5680 [6:26:39<7:09:59,  7.83s/it] 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2388/5680 [6:26:47<7:10:15,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7552', 'grad_norm': '0.3103', 'learning_rate': '0.0001248', 'ppl': '2.128', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19562496, 'tokens/trainable': 19359280, 'epoch': '3.072'}
 42%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 2388/5680 [6:26:47<7:10:15,  7.84s/it] 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2389/5680 [6:26:54<7:10:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6828', 'grad_norm': '0.3391', 'learning_rate': '0.0001247', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19570688, 'tokens/trainable': 19367448, 'epoch': '3.072'}
 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2389/5680 [6:26:54<7:10:35,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2390/5680 [6:27:02<7:11:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7295', 'grad_norm': '0.3503', 'learning_rate': '0.0001247', 'ppl': '2.074', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 19578880, 'tokens/trainable': 19375568, 'epoch': '3.072'}
 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2390/5680 [6:27:02<7:11:31,  7.87s/it] 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2391/5680 [6:27:10<7:10:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3362', 'grad_norm': '0.2623', 'learning_rate': '0.0001246', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19587072, 'tokens/trainable': 19383708, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2391/5680 [6:27:10<7:10:59,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2392/5680 [6:27:18<7:11:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.58', 'grad_norm': '0.347', 'learning_rate': '0.0001246', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19595264, 'tokens/trainable': 19391848, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 2392/5680 [6:27:18<7:11:04,  7.87s/it] 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2393/5680 [6:27:26<7:10:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4598', 'grad_norm': '0.2959', 'learning_rate': '0.0001245', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19603456, 'tokens/trainable': 19400002, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2393/5680 [6:27:26<7:10:39,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2394/5680 [6:27:34<7:10:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7979', 'grad_norm': '0.3158', 'learning_rate': '0.0001245', 'ppl': '2.221', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19611648, 'tokens/trainable': 19408136, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2394/5680 [6:27:34<7:10:07,  7.85s/it] 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2395/5680 [6:27:42<7:10:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6887', 'grad_norm': '0.3662', 'learning_rate': '0.0001244', 'ppl': '1.991', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19619840, 'tokens/trainable': 19416324, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2395/5680 [6:27:42<7:10:10,  7.86s/it] 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2396/5680 [6:27:49<7:10:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7113', 'grad_norm': '0.4097', 'learning_rate': '0.0001244', 'ppl': '2.037', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 19628032, 'tokens/trainable': 19424432, 'epoch': '3.073'}
 42%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 2396/5680 [6:27:49<7:10:26,  7.86s/it] 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2397/5680 [6:27:57<7:10:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3942', 'grad_norm': '0.2816', 'learning_rate': '0.0001243', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 19636224, 'tokens/trainable': 19432572, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2397/5680 [6:27:57<7:10:49,  7.87s/it] 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2398/5680 [6:28:05<7:11:02,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5618', 'grad_norm': '0.287', 'learning_rate': '0.0001243', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19644416, 'tokens/trainable': 19440734, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2398/5680 [6:28:05<7:11:02,  7.88s/it] 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2399/5680 [6:28:13<7:14:56,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7325', 'grad_norm': '0.3164', 'learning_rate': '0.0001242', 'ppl': '2.08', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 19652608, 'tokens/trainable': 19448904, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████                                                                                                               | 2399/5680 [6:28:13<7:14:56,  7.95s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2400/5680 [6:28:21<7:13:18,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '1.027', 'grad_norm': '0.4292', 'learning_rate': '0.0001242', 'ppl': '2.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19660800, 'tokens/trainable': 19457088, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2400/5680 [6:28:21<7:13:18,  7.93s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2401/5680 [6:28:29<7:12:00,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.785', 'grad_norm': '0.3509', 'learning_rate': '0.0001241', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19668992, 'tokens/trainable': 19465260, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2401/5680 [6:28:29<7:12:00,  7.90s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2402/5680 [6:28:37<7:11:38,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4287', 'grad_norm': '0.3312', 'learning_rate': '0.000124', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19677184, 'tokens/trainable': 19473400, 'epoch': '3.074'}
 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2402/5680 [6:28:37<7:11:38,  7.90s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2403/5680 [6:28:45<7:11:06,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.581', 'grad_norm': '0.3684', 'learning_rate': '0.000124', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19685376, 'tokens/trainable': 19481564, 'epoch': '3.075'}
 42%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 2403/5680 [6:28:45<7:11:06,  7.89s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2404/5680 [6:28:53<7:10:45,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7248', 'grad_norm': '0.3112', 'learning_rate': '0.0001239', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19693568, 'tokens/trainable': 19489752, 'epoch': '3.075'}
 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2404/5680 [6:28:53<7:10:45,  7.89s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2405/5680 [6:29:01<7:11:28,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6662', 'grad_norm': '0.3841', 'learning_rate': '0.0001239', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 19701760, 'tokens/trainable': 19497908, 'epoch': '3.075'}
 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2405/5680 [6:29:01<7:11:28,  7.90s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2406/5680 [6:29:09<7:11:48,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6543', 'grad_norm': '0.4116', 'learning_rate': '0.0001238', 'ppl': '1.924', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 19709952, 'tokens/trainable': 19506022, 'epoch': '3.075'}
 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2406/5680 [6:29:09<7:11:48,  7.91s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2407/5680 [6:29:16<7:11:06,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4008', 'grad_norm': '0.2757', 'learning_rate': '0.0001238', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 19718144, 'tokens/trainable': 19514128, 'epoch': '3.075'}
 42%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 2407/5680 [6:29:16<7:11:06,  7.90s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2408/5680 [6:29:24<7:10:21,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6706', 'grad_norm': '0.3417', 'learning_rate': '0.0001237', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19726336, 'tokens/trainable': 19522240, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2408/5680 [6:29:24<7:10:21,  7.89s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2409/5680 [6:29:32<7:08:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7998', 'grad_norm': '0.3008', 'learning_rate': '0.0001237', 'ppl': '2.225', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 19734528, 'tokens/trainable': 19530388, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2409/5680 [6:29:32<7:08:55,  7.87s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2410/5680 [6:29:40<7:08:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4845', 'grad_norm': '0.3011', 'learning_rate': '0.0001236', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19742720, 'tokens/trainable': 19538572, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2410/5680 [6:29:40<7:08:45,  7.87s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2411/5680 [6:29:48<7:08:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5217', 'grad_norm': '0.2942', 'learning_rate': '0.0001236', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19750912, 'tokens/trainable': 19546740, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 2411/5680 [6:29:48<7:08:24,  7.86s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2412/5680 [6:29:56<7:08:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8295', 'grad_norm': '0.3703', 'learning_rate': '0.0001235', 'ppl': '2.292', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 19759104, 'tokens/trainable': 19554844, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2412/5680 [6:29:56<7:08:33,  7.87s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2413/5680 [6:30:04<7:09:34,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4867', 'grad_norm': '0.3769', 'learning_rate': '0.0001235', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 19767296, 'tokens/trainable': 19563024, 'epoch': '3.076'}
 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2413/5680 [6:30:04<7:09:34,  7.89s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2414/5680 [6:30:12<7:09:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5087', 'grad_norm': '0.3155', 'learning_rate': '0.0001234', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 19775488, 'tokens/trainable': 19571204, 'epoch': '3.077'}
 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 2414/5680 [6:30:12<7:09:11,  7.88s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2415/5680 [6:30:19<7:08:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7338', 'grad_norm': '0.3875', 'learning_rate': '0.0001233', 'ppl': '2.083', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19783680, 'tokens/trainable': 19579300, 'epoch': '3.077'}
 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2415/5680 [6:30:19<7:08:13,  7.87s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2416/5680 [6:30:27<7:07:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5591', 'grad_norm': '0.3068', 'learning_rate': '0.0001233', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19791872, 'tokens/trainable': 19587420, 'epoch': '3.077'}
 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2416/5680 [6:30:27<7:07:38,  7.86s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2417/5680 [6:30:35<7:08:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3228', 'grad_norm': '0.231', 'learning_rate': '0.0001232', 'ppl': '1.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19800064, 'tokens/trainable': 19595608, 'epoch': '3.077'}
 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2417/5680 [6:30:35<7:08:11,  7.87s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2418/5680 [6:30:43<7:08:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6509', 'grad_norm': '0.3154', 'learning_rate': '0.0001232', 'ppl': '1.917', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19808256, 'tokens/trainable': 19603752, 'epoch': '3.077'}
 43%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 2418/5680 [6:30:43<7:08:23,  7.88s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2419/5680 [6:30:51<7:07:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6347', 'grad_norm': '0.3549', 'learning_rate': '0.0001231', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19816448, 'tokens/trainable': 19611876, 'epoch': '3.077'}
 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2419/5680 [6:30:51<7:07:07,  7.86s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2420/5680 [6:30:59<7:08:07,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4266', 'grad_norm': '0.2901', 'learning_rate': '0.0001231', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 19824640, 'tokens/trainable': 19620024, 'epoch': '3.078'}
 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2420/5680 [6:30:59<7:08:07,  7.88s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2421/5680 [6:31:07<7:07:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6294', 'grad_norm': '0.3312', 'learning_rate': '0.000123', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 19832832, 'tokens/trainable': 19628204, 'epoch': '3.078'}
 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2421/5680 [6:31:07<7:07:11,  7.86s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2422/5680 [6:31:14<7:06:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5365', 'grad_norm': '0.2809', 'learning_rate': '0.000123', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19841024, 'tokens/trainable': 19636336, 'epoch': '3.078'}
 43%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 2422/5680 [6:31:14<7:06:44,  7.86s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2423/5680 [6:31:22<7:06:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6319', 'grad_norm': '0.3231', 'learning_rate': '0.0001229', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19849216, 'tokens/trainable': 19644496, 'epoch': '3.078'}
 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2423/5680 [6:31:22<7:06:27,  7.86s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2424/5680 [6:31:30<7:07:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5682', 'grad_norm': '0.3385', 'learning_rate': '0.0001229', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 19857408, 'tokens/trainable': 19652622, 'epoch': '3.078'}
 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2424/5680 [6:31:30<7:07:04,  7.87s/it] 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2425/5680 [6:31:38<7:06:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8254', 'grad_norm': '0.3193', 'learning_rate': '0.0001228', 'ppl': '2.283', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 19865600, 'tokens/trainable': 19660732, 'epoch': '3.079'}
 43%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                              | 2425/5680 [6:31:38<7:06:17,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2426/5680 [6:31:46<7:06:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7166', 'grad_norm': '0.3032', 'learning_rate': '0.0001228', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 19873792, 'tokens/trainable': 19668822, 'epoch': '3.079'}
 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2426/5680 [6:31:46<7:06:09,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2427/5680 [6:31:54<7:06:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4905', 'grad_norm': '0.3425', 'learning_rate': '0.0001227', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 19881984, 'tokens/trainable': 19676940, 'epoch': '3.079'}
 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2427/5680 [6:31:54<7:06:01,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2428/5680 [6:32:02<7:05:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6789', 'grad_norm': '0.2902', 'learning_rate': '0.0001226', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 19890176, 'tokens/trainable': 19685112, 'epoch': '3.079'}
 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2428/5680 [6:32:02<7:05:57,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2429/5680 [6:32:10<7:06:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.593', 'grad_norm': '0.2923', 'learning_rate': '0.0001226', 'ppl': '1.809', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 19898368, 'tokens/trainable': 19693190, 'epoch': '3.079'}
 43%|██████████████████████████████████████████████████████████████████████████████████                                                                                                              | 2429/5680 [6:32:10<7:06:59,  7.88s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2430/5680 [6:32:18<7:13:10,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6815', 'grad_norm': '0.3498', 'learning_rate': '0.0001225', 'ppl': '1.977', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.3', 'tokens/total': 19906560, 'tokens/trainable': 19701352, 'epoch': '3.079'}
 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2430/5680 [6:32:18<7:13:10,  8.00s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2431/5680 [6:32:26<7:11:20,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.528', 'grad_norm': '0.3684', 'learning_rate': '0.0001225', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 19914752, 'tokens/trainable': 19709444, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2431/5680 [6:32:26<7:11:20,  7.97s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2432/5680 [6:32:34<7:10:24,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.437', 'grad_norm': '0.2774', 'learning_rate': '0.0001224', 'ppl': '1.548', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 19922944, 'tokens/trainable': 19717612, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2432/5680 [6:32:34<7:10:24,  7.95s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2433/5680 [6:32:41<7:08:24,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4577', 'grad_norm': '0.2707', 'learning_rate': '0.0001224', 'ppl': '1.58', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 19931136, 'tokens/trainable': 19725768, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 2433/5680 [6:32:41<7:08:24,  7.92s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2434/5680 [6:32:49<7:06:40,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5324', 'grad_norm': '0.2905', 'learning_rate': '0.0001223', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19939328, 'tokens/trainable': 19733852, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2434/5680 [6:32:49<7:06:40,  7.89s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2435/5680 [6:32:57<7:05:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.615', 'grad_norm': '0.3434', 'learning_rate': '0.0001223', 'ppl': '1.85', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 19947520, 'tokens/trainable': 19742002, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2435/5680 [6:32:57<7:05:58,  7.88s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2436/5680 [6:33:05<7:05:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6487', 'grad_norm': '0.3563', 'learning_rate': '0.0001222', 'ppl': '1.913', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 19955712, 'tokens/trainable': 19750096, 'epoch': '3.08'}
 43%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 2436/5680 [6:33:05<7:05:14,  7.87s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2437/5680 [6:33:13<7:05:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4879', 'grad_norm': '0.3241', 'learning_rate': '0.0001222', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 19963904, 'tokens/trainable': 19758252, 'epoch': '3.081'}
 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2437/5680 [6:33:13<7:05:05,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2438/5680 [6:33:21<7:05:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7496', 'grad_norm': '0.3883', 'learning_rate': '0.0001221', 'ppl': '2.116', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 19972096, 'tokens/trainable': 19766404, 'epoch': '3.081'}
 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2438/5680 [6:33:21<7:05:14,  7.87s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2439/5680 [6:33:29<7:04:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4738', 'grad_norm': '0.3653', 'learning_rate': '0.0001221', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 19980288, 'tokens/trainable': 19774528, 'epoch': '3.081'}
 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2439/5680 [6:33:29<7:04:51,  7.87s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2440/5680 [6:33:36<7:05:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3277', 'grad_norm': '0.2629', 'learning_rate': '0.000122', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 19988480, 'tokens/trainable': 19782692, 'epoch': '3.081'}
 43%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                             | 2440/5680 [6:33:36<7:05:10,  7.87s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2441/5680 [6:33:44<7:04:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4283', 'grad_norm': '0.2842', 'learning_rate': '0.0001219', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 19996672, 'tokens/trainable': 19790848, 'epoch': '3.081'}
 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2441/5680 [6:33:44<7:04:17,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2442/5680 [6:33:52<7:08:38,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.768', 'grad_norm': '0.3071', 'learning_rate': '0.0001219', 'ppl': '2.155', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 20004864, 'tokens/trainable': 19798984, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2442/5680 [6:33:52<7:08:38,  7.94s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2443/5680 [6:34:00<7:07:02,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9383', 'grad_norm': '0.6656', 'learning_rate': '0.0001218', 'ppl': '2.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 20013056, 'tokens/trainable': 19807126, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2443/5680 [6:34:00<7:07:02,  7.92s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2444/5680 [6:34:08<7:06:12,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6745', 'grad_norm': '0.3673', 'learning_rate': '0.0001218', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 20021248, 'tokens/trainable': 19815234, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 2444/5680 [6:34:08<7:06:12,  7.90s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2445/5680 [6:34:16<7:05:36,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5781', 'grad_norm': '0.3536', 'learning_rate': '0.0001217', 'ppl': '1.783', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20029440, 'tokens/trainable': 19823310, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2445/5680 [6:34:16<7:05:36,  7.89s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2446/5680 [6:34:24<7:04:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6752', 'grad_norm': '0.3018', 'learning_rate': '0.0001217', 'ppl': '1.964', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20037632, 'tokens/trainable': 19831406, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2446/5680 [6:34:24<7:04:48,  7.88s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2447/5680 [6:34:32<7:03:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8341', 'grad_norm': '0.4033', 'learning_rate': '0.0001216', 'ppl': '2.303', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1048', 'tokens/total': 20045824, 'tokens/trainable': 19839590, 'epoch': '3.082'}
 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2447/5680 [6:34:32<7:03:29,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2448/5680 [6:34:40<7:03:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.556', 'grad_norm': '0.264', 'learning_rate': '0.0001216', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 20054016, 'tokens/trainable': 19847548, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 2448/5680 [6:34:40<7:03:12,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2449/5680 [6:34:47<7:02:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5437', 'grad_norm': '0.3093', 'learning_rate': '0.0001215', 'ppl': '1.722', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 20062208, 'tokens/trainable': 19855702, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2449/5680 [6:34:47<7:02:31,  7.85s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2450/5680 [6:34:55<7:03:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3995', 'grad_norm': '0.2773', 'learning_rate': '0.0001215', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20070400, 'tokens/trainable': 19863826, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2450/5680 [6:34:55<7:03:10,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2451/5680 [6:35:03<7:02:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7255', 'grad_norm': '0.3494', 'learning_rate': '0.0001214', 'ppl': '2.066', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 20078592, 'tokens/trainable': 19871828, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                             | 2451/5680 [6:35:03<7:02:32,  7.85s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2452/5680 [6:35:11<7:02:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5327', 'grad_norm': '0.2804', 'learning_rate': '0.0001213', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 20086784, 'tokens/trainable': 19879976, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2452/5680 [6:35:11<7:02:38,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2453/5680 [6:35:19<7:02:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7573', 'grad_norm': '0.3235', 'learning_rate': '0.0001213', 'ppl': '2.132', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 20094976, 'tokens/trainable': 19888126, 'epoch': '3.083'}
 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2453/5680 [6:35:19<7:02:47,  7.86s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2454/5680 [6:35:27<7:02:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.9537', 'grad_norm': '0.3518', 'learning_rate': '0.0001212', 'ppl': '2.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 20103168, 'tokens/trainable': 19896168, 'epoch': '3.084'}
 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2454/5680 [6:35:27<7:02:12,  7.85s/it] 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2455/5680 [6:35:35<7:01:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3812', 'grad_norm': '0.2471', 'learning_rate': '0.0001212', 'ppl': '1.464', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20111360, 'tokens/trainable': 19904254, 'epoch': '3.084'}
 43%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 2455/5680 [6:35:35<7:01:53,  7.85s/it] 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2456/5680 [6:35:42<7:02:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4178', 'grad_norm': '0.2852', 'learning_rate': '0.0001211', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20119552, 'tokens/trainable': 19912420, 'epoch': '3.084'}
 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2456/5680 [6:35:42<7:02:35,  7.86s/it] 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2457/5680 [6:35:50<7:02:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5319', 'grad_norm': '0.2744', 'learning_rate': '0.0001211', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20127744, 'tokens/trainable': 19920540, 'epoch': '3.084'}
 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2457/5680 [6:35:50<7:02:21,  7.86s/it] 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2458/5680 [6:35:58<7:01:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6692', 'grad_norm': '0.3078', 'learning_rate': '0.000121', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 20135936, 'tokens/trainable': 19928672, 'epoch': '3.084'}
 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2458/5680 [6:35:58<7:01:48,  7.85s/it] 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2459/5680 [6:36:06<7:01:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6581', 'grad_norm': '0.2776', 'learning_rate': '0.000121', 'ppl': '1.931', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 20144128, 'tokens/trainable': 19936852, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████                                                                                                             | 2459/5680 [6:36:06<7:01:19,  7.85s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2460/5680 [6:36:14<7:01:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5603', 'grad_norm': '0.3088', 'learning_rate': '0.0001209', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20152320, 'tokens/trainable': 19944992, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2460/5680 [6:36:14<7:01:27,  7.85s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2461/5680 [6:36:22<7:02:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3709', 'grad_norm': '0.2867', 'learning_rate': '0.0001209', 'ppl': '1.449', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 20160512, 'tokens/trainable': 19953006, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2461/5680 [6:36:22<7:02:06,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2462/5680 [6:36:30<7:01:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6557', 'grad_norm': '0.3019', 'learning_rate': '0.0001208', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20168704, 'tokens/trainable': 19961110, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 2462/5680 [6:36:30<7:01:22,  7.86s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2463/5680 [6:36:37<7:02:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3997', 'grad_norm': '0.2677', 'learning_rate': '0.0001208', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 20176896, 'tokens/trainable': 19969162, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2463/5680 [6:36:37<7:02:19,  7.88s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2464/5680 [6:36:45<7:02:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6592', 'grad_norm': '0.3674', 'learning_rate': '0.0001207', 'ppl': '1.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20185088, 'tokens/trainable': 19977266, 'epoch': '3.085'}
 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2464/5680 [6:36:45<7:02:14,  7.88s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2465/5680 [6:36:53<7:01:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5643', 'grad_norm': '0.2932', 'learning_rate': '0.0001206', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 20193280, 'tokens/trainable': 19985426, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2465/5680 [6:36:53<7:01:56,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2466/5680 [6:37:01<7:01:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5343', 'grad_norm': '0.3319', 'learning_rate': '0.0001206', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 20201472, 'tokens/trainable': 19993466, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 2466/5680 [6:37:01<7:01:43,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2467/5680 [6:37:09<7:01:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5029', 'grad_norm': '0.3015', 'learning_rate': '0.0001205', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20209664, 'tokens/trainable': 20001540, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2467/5680 [6:37:09<7:01:29,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2468/5680 [6:37:17<7:01:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6084', 'grad_norm': '0.3295', 'learning_rate': '0.0001205', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 20217856, 'tokens/trainable': 20009696, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2468/5680 [6:37:17<7:01:27,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2469/5680 [6:37:25<7:01:36,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6203', 'grad_norm': '0.3571', 'learning_rate': '0.0001204', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 20226048, 'tokens/trainable': 20017718, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2469/5680 [6:37:25<7:01:36,  7.88s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2470/5680 [6:37:33<7:01:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6823', 'grad_norm': '0.2988', 'learning_rate': '0.0001204', 'ppl': '1.978', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20234240, 'tokens/trainable': 20025836, 'epoch': '3.086'}
 43%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 2470/5680 [6:37:33<7:01:45,  7.88s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2471/5680 [6:37:41<7:01:57,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7649', 'grad_norm': '0.317', 'learning_rate': '0.0001203', 'ppl': '2.149', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 20242432, 'tokens/trainable': 20033920, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2471/5680 [6:37:41<7:01:57,  7.89s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2472/5680 [6:37:48<7:01:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3762', 'grad_norm': '0.2856', 'learning_rate': '0.0001203', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20250624, 'tokens/trainable': 20042056, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2472/5680 [6:37:48<7:01:24,  7.88s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2473/5680 [6:37:56<7:01:55,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6668', 'grad_norm': '0.3486', 'learning_rate': '0.0001202', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20258816, 'tokens/trainable': 20050208, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 2473/5680 [6:37:56<7:01:55,  7.89s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2474/5680 [6:38:04<7:01:53,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.2693', 'learning_rate': '0.0001202', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 20267008, 'tokens/trainable': 20058320, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2474/5680 [6:38:04<7:01:53,  7.90s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2475/5680 [6:38:12<7:01:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6718', 'grad_norm': '0.3831', 'learning_rate': '0.0001201', 'ppl': '1.958', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20275200, 'tokens/trainable': 20066432, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2475/5680 [6:38:12<7:01:35,  7.89s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2476/5680 [6:38:20<7:00:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5006', 'grad_norm': '0.2664', 'learning_rate': '0.0001201', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 20283392, 'tokens/trainable': 20074604, 'epoch': '3.087'}
 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2476/5680 [6:38:20<7:00:45,  7.88s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2477/5680 [6:38:28<7:00:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5297', 'grad_norm': '0.2996', 'learning_rate': '0.00012', 'ppl': '1.698', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 20291584, 'tokens/trainable': 20082778, 'epoch': '3.088'}
 44%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 2477/5680 [6:38:28<7:00:02,  7.87s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2478/5680 [6:38:36<6:59:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5393', 'grad_norm': '0.2924', 'learning_rate': '0.0001199', 'ppl': '1.715', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 20299776, 'tokens/trainable': 20090944, 'epoch': '3.088'}
 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2478/5680 [6:38:36<6:59:32,  7.86s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2479/5680 [6:38:43<6:59:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.661', 'grad_norm': '0.3238', 'learning_rate': '0.0001199', 'ppl': '1.937', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20307968, 'tokens/trainable': 20099050, 'epoch': '3.088'}
 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2479/5680 [6:38:43<6:59:28,  7.86s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2480/5680 [6:38:51<6:59:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5903', 'grad_norm': '0.3287', 'learning_rate': '0.0001198', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20316160, 'tokens/trainable': 20107148, 'epoch': '3.088'}
 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2480/5680 [6:38:51<6:59:15,  7.86s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2481/5680 [6:38:59<6:58:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5755', 'grad_norm': '0.3777', 'learning_rate': '0.0001198', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 20324352, 'tokens/trainable': 20115308, 'epoch': '3.088'}
 44%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 2481/5680 [6:38:59<6:58:18,  7.85s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2482/5680 [6:39:07<6:58:04,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7139', 'grad_norm': '0.3445', 'learning_rate': '0.0001197', 'ppl': '2.042', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 20332544, 'tokens/trainable': 20123280, 'epoch': '3.089'}
 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2482/5680 [6:39:07<6:58:04,  7.84s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2483/5680 [6:39:15<6:58:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6981', 'grad_norm': '0.3187', 'learning_rate': '0.0001197', 'ppl': '2.01', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 20340736, 'tokens/trainable': 20131396, 'epoch': '3.089'}
 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2483/5680 [6:39:15<6:58:30,  7.85s/it] 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2484/5680 [6:39:23<6:58:14,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.764', 'grad_norm': '0.3818', 'learning_rate': '0.0001196', 'ppl': '2.147', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 20348928, 'tokens/trainable': 20139572, 'epoch': '3.089'}
 44%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2484/5680 [6:39:23<6:58:14,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2485/5680 [6:39:31<7:03:31,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3723', 'grad_norm': '0.281', 'learning_rate': '0.0001196', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.5', 'tokens/total': 20357120, 'tokens/trainable': 20147724, 'epoch': '3.089'}
 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2485/5680 [6:39:31<7:03:31,  7.95s/it] 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2486/5680 [6:39:39<7:01:43,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4156', 'grad_norm': '0.3146', 'learning_rate': '0.0001195', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20365312, 'tokens/trainable': 20155828, 'epoch': '3.089'}
 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2486/5680 [6:39:39<7:01:43,  7.92s/it] 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2487/5680 [6:39:47<7:01:04,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5651', 'grad_norm': '0.3763', 'learning_rate': '0.0001195', 'ppl': '1.76', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20373504, 'tokens/trainable': 20163980, 'epoch': '3.089'}
 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2487/5680 [6:39:47<7:01:04,  7.91s/it] 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2488/5680 [6:39:54<6:59:37,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7654', 'grad_norm': '0.3591', 'learning_rate': '0.0001194', 'ppl': '2.15', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 20381696, 'tokens/trainable': 20172140, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2488/5680 [6:39:54<6:59:37,  7.89s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2489/5680 [6:40:02<6:59:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.803', 'grad_norm': '0.4458', 'learning_rate': '0.0001193', 'ppl': '2.232', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20389888, 'tokens/trainable': 20180280, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2489/5680 [6:40:02<6:59:35,  7.89s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2490/5680 [6:40:10<6:59:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.708', 'grad_norm': '0.3248', 'learning_rate': '0.0001193', 'ppl': '2.03', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 20398080, 'tokens/trainable': 20188448, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2490/5680 [6:40:10<6:59:01,  7.88s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2491/5680 [6:40:18<6:58:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5198', 'grad_norm': '0.3368', 'learning_rate': '0.0001192', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20406272, 'tokens/trainable': 20196584, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2491/5680 [6:40:18<6:58:35,  7.88s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2492/5680 [6:40:26<6:57:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7311', 'grad_norm': '0.3592', 'learning_rate': '0.0001192', 'ppl': '2.077', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 20414464, 'tokens/trainable': 20204728, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 2492/5680 [6:40:26<6:57:53,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2493/5680 [6:40:34<6:58:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5184', 'grad_norm': '0.3031', 'learning_rate': '0.0001191', 'ppl': '1.679', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20422656, 'tokens/trainable': 20212892, 'epoch': '3.09'}
 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2493/5680 [6:40:34<6:58:14,  7.87s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2494/5680 [6:40:42<6:57:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6289', 'grad_norm': '0.3019', 'learning_rate': '0.0001191', 'ppl': '1.876', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20430848, 'tokens/trainable': 20220920, 'epoch': '3.091'}
 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2494/5680 [6:40:42<6:57:24,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2495/5680 [6:40:49<6:56:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4521', 'grad_norm': '0.3316', 'learning_rate': '0.000119', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 20439040, 'tokens/trainable': 20229060, 'epoch': '3.091'}
 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2495/5680 [6:40:49<6:56:28,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2496/5680 [6:40:57<6:56:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5387', 'grad_norm': '0.2862', 'learning_rate': '0.000119', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20447232, 'tokens/trainable': 20237184, 'epoch': '3.091'}
 44%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2496/5680 [6:40:57<6:56:39,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2497/5680 [6:41:05<6:56:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8034', 'grad_norm': '0.3341', 'learning_rate': '0.0001189', 'ppl': '2.233', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 20455424, 'tokens/trainable': 20245252, 'epoch': '3.091'}
 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2497/5680 [6:41:05<6:56:31,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2498/5680 [6:41:13<6:55:51,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4613', 'grad_norm': '0.2677', 'learning_rate': '0.0001189', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 20463616, 'tokens/trainable': 20253416, 'epoch': '3.091'}
 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2498/5680 [6:41:13<6:55:51,  7.84s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2499/5680 [6:41:21<6:56:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.9316', 'grad_norm': '0.3147', 'learning_rate': '0.0001188', 'ppl': '2.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 20471808, 'tokens/trainable': 20261594, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 2499/5680 [6:41:21<6:56:01,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2500/5680 [6:41:29<6:55:27,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.8775', 'grad_norm': '0.4125', 'learning_rate': '0.0001187', 'ppl': '2.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 20480000, 'tokens/trainable': 20269624, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2500/5680 [6:41:29<6:55:27,  7.84s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2501/5680 [6:41:37<6:55:24,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5891', 'grad_norm': '0.2916', 'learning_rate': '0.0001187', 'ppl': '1.802', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 20488192, 'tokens/trainable': 20277772, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2501/5680 [6:41:37<6:55:24,  7.84s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2502/5680 [6:41:44<6:54:49,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.835', 'grad_norm': '0.309', 'learning_rate': '0.0001186', 'ppl': '2.305', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20496384, 'tokens/trainable': 20285786, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2502/5680 [6:41:44<6:54:49,  7.83s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2503/5680 [6:41:52<6:55:10,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.936', 'grad_norm': '0.3587', 'learning_rate': '0.0001186', 'ppl': '2.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 20504576, 'tokens/trainable': 20293776, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                           | 2503/5680 [6:41:52<6:55:10,  7.84s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2504/5680 [6:42:00<6:55:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5229', 'grad_norm': '0.2973', 'learning_rate': '0.0001185', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 20512768, 'tokens/trainable': 20301914, 'epoch': '3.092'}
 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2504/5680 [6:42:00<6:55:21,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2505/5680 [6:42:08<6:55:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5248', 'grad_norm': '0.3189', 'learning_rate': '0.0001185', 'ppl': '1.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 20520960, 'tokens/trainable': 20309938, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2505/5680 [6:42:08<6:55:35,  7.85s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2506/5680 [6:42:16<6:55:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4828', 'grad_norm': '0.2755', 'learning_rate': '0.0001184', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20529152, 'tokens/trainable': 20318086, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2506/5680 [6:42:16<6:55:46,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2507/5680 [6:42:24<6:55:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5753', 'grad_norm': '0.3644', 'learning_rate': '0.0001184', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20537344, 'tokens/trainable': 20326220, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 2507/5680 [6:42:24<6:55:51,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2508/5680 [6:42:32<6:55:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.556', 'grad_norm': '0.2974', 'learning_rate': '0.0001183', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20545536, 'tokens/trainable': 20334338, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2508/5680 [6:42:32<6:55:24,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2509/5680 [6:42:39<6:55:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4833', 'grad_norm': '0.297', 'learning_rate': '0.0001183', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 20553728, 'tokens/trainable': 20342342, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2509/5680 [6:42:39<6:55:14,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2510/5680 [6:42:47<6:55:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3726', 'grad_norm': '0.3118', 'learning_rate': '0.0001182', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20561920, 'tokens/trainable': 20350486, 'epoch': '3.093'}
 44%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 2510/5680 [6:42:47<6:55:31,  7.86s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2511/5680 [6:42:55<6:55:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9718', 'grad_norm': '0.3549', 'learning_rate': '0.0001182', 'ppl': '2.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20570112, 'tokens/trainable': 20358630, 'epoch': '3.094'}
 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2511/5680 [6:42:55<6:55:28,  7.87s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2512/5680 [6:43:03<6:55:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5151', 'grad_norm': '0.337', 'learning_rate': '0.0001181', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20578304, 'tokens/trainable': 20366710, 'epoch': '3.094'}
 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2512/5680 [6:43:03<6:55:36,  7.87s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2513/5680 [6:43:11<6:56:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6919', 'grad_norm': '0.3467', 'learning_rate': '0.000118', 'ppl': '1.997', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20586496, 'tokens/trainable': 20374892, 'epoch': '3.094'}
 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2513/5680 [6:43:11<6:56:12,  7.89s/it] 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2514/5680 [6:43:19<6:56:20,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6533', 'grad_norm': '0.3981', 'learning_rate': '0.000118', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 20594688, 'tokens/trainable': 20382936, 'epoch': '3.094'}
 44%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                           | 2514/5680 [6:43:19<6:56:20,  7.89s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2515/5680 [6:43:27<6:56:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.8794', 'grad_norm': '0.3753', 'learning_rate': '0.0001179', 'ppl': '2.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 20602880, 'tokens/trainable': 20390984, 'epoch': '3.094'}
 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2515/5680 [6:43:27<6:56:16,  7.89s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2516/5680 [6:43:35<6:55:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6052', 'grad_norm': '0.2972', 'learning_rate': '0.0001179', 'ppl': '1.832', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 20611072, 'tokens/trainable': 20399130, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2516/5680 [6:43:35<6:55:12,  7.87s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2517/5680 [6:43:42<6:55:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5016', 'grad_norm': '0.3668', 'learning_rate': '0.0001178', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 20619264, 'tokens/trainable': 20407280, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2517/5680 [6:43:42<6:55:46,  7.89s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2518/5680 [6:43:51<7:00:02,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6075', 'grad_norm': '0.3372', 'learning_rate': '0.0001178', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.8', 'tokens/total': 20627456, 'tokens/trainable': 20415400, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 2518/5680 [6:43:51<7:00:02,  7.97s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2519/5680 [6:43:59<6:58:23,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6564', 'grad_norm': '0.3711', 'learning_rate': '0.0001177', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 20635648, 'tokens/trainable': 20423332, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2519/5680 [6:43:59<6:58:23,  7.94s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2520/5680 [6:44:06<6:57:23,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5485', 'grad_norm': '0.3536', 'learning_rate': '0.0001177', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 20643840, 'tokens/trainable': 20431342, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2520/5680 [6:44:06<6:57:23,  7.93s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2521/5680 [6:44:14<6:56:25,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.693', 'grad_norm': '0.3267', 'learning_rate': '0.0001176', 'ppl': '2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20652032, 'tokens/trainable': 20439490, 'epoch': '3.095'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 2521/5680 [6:44:14<6:56:25,  7.91s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2522/5680 [6:44:22<6:55:38,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.3158', 'learning_rate': '0.0001176', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 20660224, 'tokens/trainable': 20447564, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2522/5680 [6:44:22<6:55:38,  7.90s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2523/5680 [6:44:30<6:55:36,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4983', 'grad_norm': '0.3054', 'learning_rate': '0.0001175', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 20668416, 'tokens/trainable': 20455536, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2523/5680 [6:44:30<6:55:36,  7.90s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2524/5680 [6:44:38<6:54:47,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5668', 'grad_norm': '0.3128', 'learning_rate': '0.0001174', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 20676608, 'tokens/trainable': 20463580, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2524/5680 [6:44:38<6:54:47,  7.89s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2525/5680 [6:44:46<6:53:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5502', 'grad_norm': '0.3198', 'learning_rate': '0.0001174', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 20684800, 'tokens/trainable': 20471734, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 2525/5680 [6:44:46<6:53:48,  7.87s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2526/5680 [6:44:54<6:53:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5936', 'grad_norm': '0.3349', 'learning_rate': '0.0001173', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 20692992, 'tokens/trainable': 20479876, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2526/5680 [6:44:54<6:53:28,  7.87s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2527/5680 [6:45:01<6:53:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3456', 'grad_norm': '0.2892', 'learning_rate': '0.0001173', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20701184, 'tokens/trainable': 20487976, 'epoch': '3.096'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2527/5680 [6:45:01<6:53:14,  7.86s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2528/5680 [6:45:10<6:57:40,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4085', 'grad_norm': '0.3773', 'learning_rate': '0.0001172', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.4', 'tokens/total': 20709376, 'tokens/trainable': 20496032, 'epoch': '3.097'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2528/5680 [6:45:10<6:57:40,  7.95s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2529/5680 [6:45:17<6:55:38,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4605', 'grad_norm': '0.2592', 'learning_rate': '0.0001172', 'ppl': '1.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 20717568, 'tokens/trainable': 20503990, 'epoch': '3.097'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                          | 2529/5680 [6:45:17<6:55:38,  7.91s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2530/5680 [6:45:25<6:54:30,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.8314', 'grad_norm': '0.3733', 'learning_rate': '0.0001171', 'ppl': '2.297', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20725760, 'tokens/trainable': 20512108, 'epoch': '3.097'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2530/5680 [6:45:25<6:54:30,  7.90s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2531/5680 [6:45:33<6:54:55,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6601', 'grad_norm': '0.3536', 'learning_rate': '0.0001171', 'ppl': '1.935', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.4', 'tokens/total': 20733952, 'tokens/trainable': 20519992, 'epoch': '3.097'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2531/5680 [6:45:33<6:54:55,  7.91s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2532/5680 [6:45:41<6:54:32,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6827', 'grad_norm': '0.3428', 'learning_rate': '0.000117', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20742144, 'tokens/trainable': 20528136, 'epoch': '3.097'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2532/5680 [6:45:41<6:54:32,  7.90s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2533/5680 [6:45:49<6:54:05,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6405', 'grad_norm': '0.3431', 'learning_rate': '0.000117', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 20750336, 'tokens/trainable': 20536228, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 2533/5680 [6:45:49<6:54:05,  7.89s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2534/5680 [6:45:57<6:53:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7175', 'grad_norm': '0.289', 'learning_rate': '0.0001169', 'ppl': '2.049', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20758528, 'tokens/trainable': 20544324, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2534/5680 [6:45:57<6:53:17,  7.88s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2535/5680 [6:46:05<6:52:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4529', 'grad_norm': '0.3015', 'learning_rate': '0.0001168', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 20766720, 'tokens/trainable': 20552400, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2535/5680 [6:46:05<6:52:45,  7.87s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2536/5680 [6:46:13<6:52:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6599', 'grad_norm': '0.3183', 'learning_rate': '0.0001168', 'ppl': '1.935', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 20774912, 'tokens/trainable': 20560432, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 2536/5680 [6:46:13<6:52:28,  7.87s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2537/5680 [6:46:20<6:51:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6587', 'grad_norm': '0.3494', 'learning_rate': '0.0001167', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 20783104, 'tokens/trainable': 20568392, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2537/5680 [6:46:20<6:51:57,  7.86s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2538/5680 [6:46:28<6:52:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.657', 'grad_norm': '0.3293', 'learning_rate': '0.0001167', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 20791296, 'tokens/trainable': 20576560, 'epoch': '3.098'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2538/5680 [6:46:28<6:52:24,  7.88s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2539/5680 [6:46:36<6:51:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8248', 'grad_norm': '0.3365', 'learning_rate': '0.0001166', 'ppl': '2.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 20799488, 'tokens/trainable': 20584666, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2539/5680 [6:46:36<6:51:39,  7.86s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2540/5680 [6:46:44<6:51:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6296', 'grad_norm': '0.3245', 'learning_rate': '0.0001166', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.2', 'tokens/total': 20807680, 'tokens/trainable': 20592340, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                          | 2540/5680 [6:46:44<6:51:45,  7.87s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2541/5680 [6:46:52<6:51:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5795', 'grad_norm': '0.3728', 'learning_rate': '0.0001165', 'ppl': '1.785', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971', 'tokens/total': 20815872, 'tokens/trainable': 20599972, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2541/5680 [6:46:52<6:51:32,  7.87s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2542/5680 [6:47:00<6:51:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4764', 'grad_norm': '0.342', 'learning_rate': '0.0001165', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20824064, 'tokens/trainable': 20608076, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2542/5680 [6:47:00<6:51:16,  7.86s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2543/5680 [6:47:08<6:51:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5174', 'grad_norm': '0.3314', 'learning_rate': '0.0001164', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980', 'tokens/total': 20832256, 'tokens/trainable': 20615788, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2543/5680 [6:47:08<6:51:14,  7.87s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2544/5680 [6:47:15<6:51:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6791', 'grad_norm': '0.3442', 'learning_rate': '0.0001164', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 20840448, 'tokens/trainable': 20623800, 'epoch': '3.099'}
 45%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 2544/5680 [6:47:15<6:51:25,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2545/5680 [6:47:23<6:50:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7152', 'grad_norm': '0.345', 'learning_rate': '0.0001163', 'ppl': '2.045', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 20848640, 'tokens/trainable': 20631928, 'epoch': '3.1'}
 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2545/5680 [6:47:23<6:50:44,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2546/5680 [6:47:31<6:51:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7145', 'grad_norm': '0.3791', 'learning_rate': '0.0001162', 'ppl': '2.043', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 20856832, 'tokens/trainable': 20639876, 'epoch': '3.1'}
 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2546/5680 [6:47:31<6:51:11,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2547/5680 [6:47:39<6:51:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6447', 'grad_norm': '0.3113', 'learning_rate': '0.0001162', 'ppl': '1.905', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 20865024, 'tokens/trainable': 20647770, 'epoch': '3.1'}
 45%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                          | 2547/5680 [6:47:39<6:51:15,  7.88s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2548/5680 [6:47:47<6:50:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6217', 'grad_norm': '0.2983', 'learning_rate': '0.0001161', 'ppl': '1.862', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 20873216, 'tokens/trainable': 20655760, 'epoch': '3.1'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2548/5680 [6:47:47<6:50:44,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2549/5680 [6:47:55<6:50:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '1.054', 'grad_norm': '0.3543', 'learning_rate': '0.0001161', 'ppl': '2.869', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.5', 'tokens/total': 20881408, 'tokens/trainable': 20663396, 'epoch': '3.1'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2549/5680 [6:47:55<6:50:24,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2550/5680 [6:48:03<6:50:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6501', 'grad_norm': '0.4159', 'learning_rate': '0.000116', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 20889600, 'tokens/trainable': 20671322, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2550/5680 [6:48:03<6:50:26,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2551/5680 [6:48:11<6:50:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.477', 'grad_norm': '0.2848', 'learning_rate': '0.000116', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 20897792, 'tokens/trainable': 20679348, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 2551/5680 [6:48:11<6:50:17,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2552/5680 [6:48:18<6:50:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5439', 'grad_norm': '0.2979', 'learning_rate': '0.0001159', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 20905984, 'tokens/trainable': 20687494, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2552/5680 [6:48:18<6:50:39,  7.88s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2553/5680 [6:48:26<6:50:01,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5671', 'grad_norm': '0.3222', 'learning_rate': '0.0001159', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 20914176, 'tokens/trainable': 20695340, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2553/5680 [6:48:26<6:50:01,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2554/5680 [6:48:34<6:49:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6797', 'grad_norm': '0.3806', 'learning_rate': '0.0001158', 'ppl': '1.973', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 20922368, 'tokens/trainable': 20703268, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2554/5680 [6:48:34<6:49:23,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2555/5680 [6:48:42<6:49:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3167', 'grad_norm': '0.2525', 'learning_rate': '0.0001158', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 20930560, 'tokens/trainable': 20711400, 'epoch': '3.101'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                         | 2555/5680 [6:48:42<6:49:30,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2556/5680 [6:48:50<6:49:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6691', 'grad_norm': '0.3105', 'learning_rate': '0.0001157', 'ppl': '1.952', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 20938752, 'tokens/trainable': 20719506, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2556/5680 [6:48:50<6:49:15,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2557/5680 [6:48:58<6:48:47,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.413', 'grad_norm': '0.2684', 'learning_rate': '0.0001156', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 20946944, 'tokens/trainable': 20727630, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2557/5680 [6:48:58<6:48:47,  7.85s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2558/5680 [6:49:06<6:49:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4322', 'grad_norm': '0.2747', 'learning_rate': '0.0001156', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 20955136, 'tokens/trainable': 20735688, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2558/5680 [6:49:06<6:49:13,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2559/5680 [6:49:13<6:48:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4246', 'grad_norm': '0.3321', 'learning_rate': '0.0001155', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 20963328, 'tokens/trainable': 20743620, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2559/5680 [6:49:13<6:48:54,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2560/5680 [6:49:21<6:49:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4797', 'grad_norm': '0.3098', 'learning_rate': '0.0001155', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 20971520, 'tokens/trainable': 20751544, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2560/5680 [6:49:21<6:49:08,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2561/5680 [6:49:29<6:48:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5565', 'grad_norm': '0.2766', 'learning_rate': '0.0001154', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 20979712, 'tokens/trainable': 20759496, 'epoch': '3.102'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2561/5680 [6:49:29<6:48:43,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2562/5680 [6:49:37<6:48:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4292', 'grad_norm': '0.293', 'learning_rate': '0.0001154', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 20987904, 'tokens/trainable': 20767432, 'epoch': '3.103'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 2562/5680 [6:49:37<6:48:39,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2563/5680 [6:49:45<6:48:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5437', 'grad_norm': '0.3648', 'learning_rate': '0.0001153', 'ppl': '1.722', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.3', 'tokens/total': 20996096, 'tokens/trainable': 20775132, 'epoch': '3.103'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2563/5680 [6:49:45<6:48:32,  7.86s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2564/5680 [6:49:53<6:48:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7722', 'grad_norm': '0.3298', 'learning_rate': '0.0001153', 'ppl': '2.164', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 21004288, 'tokens/trainable': 20783068, 'epoch': '3.103'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2564/5680 [6:49:53<6:48:45,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2565/5680 [6:50:01<6:48:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.718', 'grad_norm': '0.3959', 'learning_rate': '0.0001152', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 21012480, 'tokens/trainable': 20791198, 'epoch': '3.103'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2565/5680 [6:50:01<6:48:21,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2566/5680 [6:50:09<6:48:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3847', 'grad_norm': '0.2707', 'learning_rate': '0.0001152', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.6', 'tokens/total': 21020672, 'tokens/trainable': 20798942, 'epoch': '3.103'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                         | 2566/5680 [6:50:09<6:48:21,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2567/5680 [6:50:16<6:48:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5368', 'grad_norm': '0.3199', 'learning_rate': '0.0001151', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 21028864, 'tokens/trainable': 20806870, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2567/5680 [6:50:16<6:48:40,  7.88s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2568/5680 [6:50:24<6:48:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7526', 'grad_norm': '0.3318', 'learning_rate': '0.000115', 'ppl': '2.123', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 21037056, 'tokens/trainable': 20814910, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2568/5680 [6:50:24<6:48:34,  7.88s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2569/5680 [6:50:32<6:48:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7054', 'grad_norm': '0.3535', 'learning_rate': '0.000115', 'ppl': '2.025', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.1', 'tokens/total': 21045248, 'tokens/trainable': 20822672, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2569/5680 [6:50:32<6:48:13,  7.87s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2570/5680 [6:50:40<6:48:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4697', 'grad_norm': '0.2674', 'learning_rate': '0.0001149', 'ppl': '1.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21053440, 'tokens/trainable': 20830848, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2570/5680 [6:50:40<6:48:14,  7.88s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2571/5680 [6:50:48<6:52:16,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5598', 'grad_norm': '0.3173', 'learning_rate': '0.0001149', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 21061632, 'tokens/trainable': 20838998, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2571/5680 [6:50:48<6:52:16,  7.96s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2572/5680 [6:50:56<6:51:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5149', 'grad_norm': '0.2756', 'learning_rate': '0.0001148', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 21069824, 'tokens/trainable': 20847008, 'epoch': '3.104'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2572/5680 [6:50:56<6:51:24,  7.94s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2573/5680 [6:51:04<6:49:42,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6232', 'grad_norm': '0.2865', 'learning_rate': '0.0001148', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 21078016, 'tokens/trainable': 20855088, 'epoch': '3.105'}
 45%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                         | 2573/5680 [6:51:04<6:49:42,  7.91s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2574/5680 [6:51:12<6:48:48,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6979', 'grad_norm': '0.2878', 'learning_rate': '0.0001147', 'ppl': '2.01', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 21086208, 'tokens/trainable': 20863108, 'epoch': '3.105'}
 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2574/5680 [6:51:12<6:48:48,  7.90s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2575/5680 [6:51:20<6:47:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4622', 'grad_norm': '0.3179', 'learning_rate': '0.0001147', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 21094400, 'tokens/trainable': 20871226, 'epoch': '3.105'}
 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2575/5680 [6:51:20<6:47:52,  7.88s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2576/5680 [6:51:28<6:47:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3528', 'grad_norm': '0.2766', 'learning_rate': '0.0001146', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 21102592, 'tokens/trainable': 20879140, 'epoch': '3.105'}
 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2576/5680 [6:51:28<6:47:27,  7.88s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2577/5680 [6:51:35<6:46:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5381', 'grad_norm': '0.296', 'learning_rate': '0.0001145', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 21110784, 'tokens/trainable': 20887004, 'epoch': '3.105'}
 45%|███████████████████████████████████████████████████████████████████████████████████████                                                                                                         | 2577/5680 [6:51:35<6:46:38,  7.86s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2578/5680 [6:51:43<6:46:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6495', 'grad_norm': '0.4177', 'learning_rate': '0.0001145', 'ppl': '1.915', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 21118976, 'tokens/trainable': 20895122, 'epoch': '3.105'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2578/5680 [6:51:43<6:46:21,  7.86s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2579/5680 [6:51:51<6:46:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5126', 'grad_norm': '0.3125', 'learning_rate': '0.0001144', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 21127168, 'tokens/trainable': 20903284, 'epoch': '3.106'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2579/5680 [6:51:51<6:46:02,  7.86s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2580/5680 [6:51:59<6:45:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7512', 'grad_norm': '0.3315', 'learning_rate': '0.0001144', 'ppl': '2.119', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.1', 'tokens/total': 21135360, 'tokens/trainable': 20911032, 'epoch': '3.106'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2580/5680 [6:51:59<6:45:28,  7.85s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2581/5680 [6:52:07<6:45:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4548', 'grad_norm': '0.2647', 'learning_rate': '0.0001143', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 21143552, 'tokens/trainable': 20918922, 'epoch': '3.106'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 2581/5680 [6:52:07<6:45:42,  7.86s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2582/5680 [6:52:15<6:45:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5186', 'grad_norm': '0.3186', 'learning_rate': '0.0001143', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 21151744, 'tokens/trainable': 20926928, 'epoch': '3.106'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2582/5680 [6:52:15<6:45:38,  7.86s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2583/5680 [6:52:22<6:45:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5064', 'grad_norm': '0.2946', 'learning_rate': '0.0001142', 'ppl': '1.659', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 21159936, 'tokens/trainable': 20934932, 'epoch': '3.106'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2583/5680 [6:52:22<6:45:17,  7.85s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2584/5680 [6:52:30<6:45:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.497', 'grad_norm': '0.3458', 'learning_rate': '0.0001142', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 21168128, 'tokens/trainable': 20942912, 'epoch': '3.107'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 2584/5680 [6:52:30<6:45:21,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2585/5680 [6:52:38<6:44:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4902', 'grad_norm': '0.3473', 'learning_rate': '0.0001141', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.3', 'tokens/total': 21176320, 'tokens/trainable': 20950634, 'epoch': '3.107'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2585/5680 [6:52:38<6:44:51,  7.85s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2586/5680 [6:52:46<6:45:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4904', 'grad_norm': '0.3911', 'learning_rate': '0.0001141', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 21184512, 'tokens/trainable': 20958444, 'epoch': '3.107'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2586/5680 [6:52:46<6:45:00,  7.85s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2587/5680 [6:52:54<6:45:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7406', 'grad_norm': '0.4552', 'learning_rate': '0.000114', 'ppl': '2.097', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 21192704, 'tokens/trainable': 20966352, 'epoch': '3.107'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2587/5680 [6:52:54<6:45:45,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2588/5680 [6:53:02<6:45:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5272', 'grad_norm': '0.3624', 'learning_rate': '0.0001139', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.3', 'tokens/total': 21200896, 'tokens/trainable': 20974198, 'epoch': '3.107'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 2588/5680 [6:53:02<6:45:28,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2589/5680 [6:53:10<6:45:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5214', 'grad_norm': '0.2882', 'learning_rate': '0.0001139', 'ppl': '1.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 21209088, 'tokens/trainable': 20982250, 'epoch': '3.107'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2589/5680 [6:53:10<6:45:05,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2590/5680 [6:53:17<6:44:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9048', 'grad_norm': '0.417', 'learning_rate': '0.0001138', 'ppl': '2.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.3', 'tokens/total': 21217280, 'tokens/trainable': 20990044, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2590/5680 [6:53:17<6:44:37,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2591/5680 [6:53:25<6:44:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4151', 'grad_norm': '0.2973', 'learning_rate': '0.0001138', 'ppl': '1.514', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.6', 'tokens/total': 21225472, 'tokens/trainable': 20997884, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2591/5680 [6:53:25<6:44:46,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2592/5680 [6:53:33<6:44:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.468', 'grad_norm': '0.2807', 'learning_rate': '0.0001137', 'ppl': '1.597', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 21233664, 'tokens/trainable': 21006046, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                        | 2592/5680 [6:53:33<6:44:47,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2593/5680 [6:53:41<6:45:01,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4941', 'grad_norm': '0.2899', 'learning_rate': '0.0001137', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.9', 'tokens/total': 21241856, 'tokens/trainable': 21013838, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2593/5680 [6:53:41<6:45:01,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2594/5680 [6:53:49<6:44:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6309', 'grad_norm': '0.3169', 'learning_rate': '0.0001136', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 21250048, 'tokens/trainable': 21021748, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2594/5680 [6:53:49<6:44:51,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2595/5680 [6:53:57<6:45:22,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5577', 'grad_norm': '0.38', 'learning_rate': '0.0001136', 'ppl': '1.747', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 21258240, 'tokens/trainable': 21029672, 'epoch': '3.108'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 2595/5680 [6:53:57<6:45:22,  7.88s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2596/5680 [6:54:05<6:44:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6306', 'grad_norm': '0.3825', 'learning_rate': '0.0001135', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.8', 'tokens/total': 21266432, 'tokens/trainable': 21037438, 'epoch': '3.109'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2596/5680 [6:54:05<6:44:38,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2597/5680 [6:54:13<6:44:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5558', 'grad_norm': '0.3157', 'learning_rate': '0.0001135', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 21274624, 'tokens/trainable': 21045518, 'epoch': '3.109'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2597/5680 [6:54:13<6:44:15,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2598/5680 [6:54:20<6:44:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3829', 'grad_norm': '0.3194', 'learning_rate': '0.0001134', 'ppl': '1.467', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 21282816, 'tokens/trainable': 21053388, 'epoch': '3.109'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2598/5680 [6:54:20<6:44:05,  7.87s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2599/5680 [6:54:28<6:43:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6024', 'grad_norm': '0.3536', 'learning_rate': '0.0001133', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 21291008, 'tokens/trainable': 21061204, 'epoch': '3.109'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                        | 2599/5680 [6:54:28<6:43:30,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2600/5680 [6:54:36<6:43:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4765', 'grad_norm': '0.3036', 'learning_rate': '0.0001133', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 21299200, 'tokens/trainable': 21069140, 'epoch': '3.109'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2600/5680 [6:54:36<6:43:10,  7.85s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2601/5680 [6:54:44<6:43:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6501', 'grad_norm': '0.4029', 'learning_rate': '0.0001132', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 21307392, 'tokens/trainable': 21077028, 'epoch': '3.11'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2601/5680 [6:54:44<6:43:05,  7.86s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2602/5680 [6:54:52<6:42:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3542', 'grad_norm': '0.287', 'learning_rate': '0.0001132', 'ppl': '1.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.8', 'tokens/total': 21315584, 'tokens/trainable': 21084718, 'epoch': '3.11'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2602/5680 [6:54:52<6:42:46,  7.85s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2603/5680 [6:55:00<6:41:58,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5811', 'grad_norm': '0.3575', 'learning_rate': '0.0001131', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 21323776, 'tokens/trainable': 21092732, 'epoch': '3.11'}
 46%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                        | 2603/5680 [6:55:00<6:41:58,  7.84s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2604/5680 [6:55:08<6:42:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3785', 'grad_norm': '0.3788', 'learning_rate': '0.0001131', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 21331968, 'tokens/trainable': 21100644, 'epoch': '3.11'}
 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2604/5680 [6:55:08<6:42:21,  7.85s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2605/5680 [6:55:15<6:41:46,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5373', 'grad_norm': '0.3329', 'learning_rate': '0.000113', 'ppl': '1.711', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.6', 'tokens/total': 21340160, 'tokens/trainable': 21108428, 'epoch': '3.11'}
 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2605/5680 [6:55:15<6:41:46,  7.84s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2606/5680 [6:55:23<6:42:14,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6032', 'grad_norm': '0.3198', 'learning_rate': '0.000113', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 21348352, 'tokens/trainable': 21116344, 'epoch': '3.11'}
 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2606/5680 [6:55:23<6:42:14,  7.85s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2607/5680 [6:55:31<6:42:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5059', 'grad_norm': '0.396', 'learning_rate': '0.0001129', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 21356544, 'tokens/trainable': 21124380, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 2607/5680 [6:55:31<6:42:33,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2608/5680 [6:55:39<6:42:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5664', 'grad_norm': '0.3982', 'learning_rate': '0.0001129', 'ppl': '1.762', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.7', 'tokens/total': 21364736, 'tokens/trainable': 21132120, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2608/5680 [6:55:39<6:42:35,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2609/5680 [6:55:47<6:42:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4639', 'grad_norm': '0.3391', 'learning_rate': '0.0001128', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.1', 'tokens/total': 21372928, 'tokens/trainable': 21139832, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2609/5680 [6:55:47<6:42:26,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2610/5680 [6:55:55<6:41:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4705', 'grad_norm': '0.2633', 'learning_rate': '0.0001127', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21381120, 'tokens/trainable': 21147944, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                       | 2610/5680 [6:55:55<6:41:45,  7.85s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2611/5680 [6:56:03<6:41:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6736', 'grad_norm': '0.4165', 'learning_rate': '0.0001127', 'ppl': '1.961', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.7', 'tokens/total': 21389312, 'tokens/trainable': 21155698, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2611/5680 [6:56:03<6:41:44,  7.85s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2612/5680 [6:56:10<6:41:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.636', 'grad_norm': '0.3601', 'learning_rate': '0.0001126', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.7', 'tokens/total': 21397504, 'tokens/trainable': 21163500, 'epoch': '3.111'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2612/5680 [6:56:10<6:41:50,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2613/5680 [6:56:18<6:41:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4961', 'grad_norm': '0.2916', 'learning_rate': '0.0001126', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.6', 'tokens/total': 21405696, 'tokens/trainable': 21171136, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2613/5680 [6:56:18<6:41:43,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2614/5680 [6:56:26<6:46:32,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.539', 'grad_norm': '0.2983', 'learning_rate': '0.0001125', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '965.6', 'tokens/total': 21413888, 'tokens/trainable': 21179036, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2614/5680 [6:56:26<6:46:32,  7.96s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2615/5680 [6:56:34<6:44:44,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7365', 'grad_norm': '0.3118', 'learning_rate': '0.0001125', 'ppl': '2.089', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 21422080, 'tokens/trainable': 21186956, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2615/5680 [6:56:34<6:44:44,  7.92s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2616/5680 [6:56:42<6:44:48,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4052', 'grad_norm': '0.3057', 'learning_rate': '0.0001124', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.3', 'tokens/total': 21430272, 'tokens/trainable': 21194796, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2616/5680 [6:56:42<6:44:48,  7.93s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2617/5680 [6:56:50<6:43:14,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.9704', 'grad_norm': '0.3359', 'learning_rate': '0.0001124', 'ppl': '2.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.6', 'tokens/total': 21438464, 'tokens/trainable': 21202460, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2617/5680 [6:56:50<6:43:14,  7.90s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2618/5680 [6:56:58<6:47:36,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7077', 'grad_norm': '0.3839', 'learning_rate': '0.0001123', 'ppl': '2.029', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.8', 'tokens/total': 21446656, 'tokens/trainable': 21210484, 'epoch': '3.112'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                       | 2618/5680 [6:56:58<6:47:36,  7.99s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2619/5680 [6:57:06<6:45:09,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5378', 'grad_norm': '0.3023', 'learning_rate': '0.0001122', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 21454848, 'tokens/trainable': 21218648, 'epoch': '3.113'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2619/5680 [6:57:06<6:45:09,  7.94s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2620/5680 [6:57:14<6:43:52,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9535', 'grad_norm': '0.4168', 'learning_rate': '0.0001122', 'ppl': '2.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 21463040, 'tokens/trainable': 21226584, 'epoch': '3.113'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2620/5680 [6:57:14<6:43:52,  7.92s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2621/5680 [6:57:22<6:43:22,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4832', 'grad_norm': '0.3023', 'learning_rate': '0.0001121', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 21471232, 'tokens/trainable': 21234720, 'epoch': '3.113'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 2621/5680 [6:57:22<6:43:22,  7.91s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2622/5680 [6:57:30<6:42:27,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4408', 'grad_norm': '0.4048', 'learning_rate': '0.0001121', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 21479424, 'tokens/trainable': 21242656, 'epoch': '3.113'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2622/5680 [6:57:30<6:42:27,  7.90s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2623/5680 [6:57:38<6:41:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.391', 'grad_norm': '0.2805', 'learning_rate': '0.000112', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 21487616, 'tokens/trainable': 21250652, 'epoch': '3.113'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2623/5680 [6:57:38<6:41:19,  7.88s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2624/5680 [6:57:45<6:40:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4031', 'grad_norm': '0.2787', 'learning_rate': '0.000112', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.2', 'tokens/total': 21495808, 'tokens/trainable': 21258344, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2624/5680 [6:57:45<6:40:52,  7.87s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2625/5680 [6:57:53<6:40:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3679', 'grad_norm': '0.2558', 'learning_rate': '0.0001119', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 21504000, 'tokens/trainable': 21266424, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 2625/5680 [6:57:53<6:40:24,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2626/5680 [6:58:01<6:40:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7013', 'grad_norm': '0.356', 'learning_rate': '0.0001119', 'ppl': '2.016', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 21512192, 'tokens/trainable': 21274328, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2626/5680 [6:58:01<6:40:18,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2627/5680 [6:58:09<6:40:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5533', 'grad_norm': '0.3116', 'learning_rate': '0.0001118', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.1', 'tokens/total': 21520384, 'tokens/trainable': 21282176, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2627/5680 [6:58:09<6:40:02,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2628/5680 [6:58:17<6:39:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4638', 'grad_norm': '0.2972', 'learning_rate': '0.0001118', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.1', 'tokens/total': 21528576, 'tokens/trainable': 21289932, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2628/5680 [6:58:17<6:39:29,  7.85s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2629/5680 [6:58:25<6:40:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3908', 'grad_norm': '0.3218', 'learning_rate': '0.0001117', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '963.6', 'tokens/total': 21536768, 'tokens/trainable': 21297540, 'epoch': '3.114'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 2629/5680 [6:58:25<6:40:00,  7.87s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2630/5680 [6:58:33<6:39:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4468', 'grad_norm': '0.3966', 'learning_rate': '0.0001116', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.5', 'tokens/total': 21544960, 'tokens/trainable': 21305224, 'epoch': '3.115'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2630/5680 [6:58:33<6:39:33,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2631/5680 [6:58:40<6:39:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8007', 'grad_norm': '0.3687', 'learning_rate': '0.0001116', 'ppl': '2.227', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 21553152, 'tokens/trainable': 21313280, 'epoch': '3.115'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2631/5680 [6:58:40<6:39:25,  7.86s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2632/5680 [6:58:48<6:38:54,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3383', 'grad_norm': '0.2838', 'learning_rate': '0.0001115', 'ppl': '1.403', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996', 'tokens/total': 21561344, 'tokens/trainable': 21321082, 'epoch': '3.115'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 2632/5680 [6:58:48<6:38:54,  7.85s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2633/5680 [6:58:56<6:38:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7119', 'grad_norm': '0.3104', 'learning_rate': '0.0001115', 'ppl': '2.038', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.9', 'tokens/total': 21569536, 'tokens/trainable': 21328870, 'epoch': '3.115'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2633/5680 [6:58:56<6:38:39,  7.85s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2634/5680 [6:59:04<6:38:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7577', 'grad_norm': '0.3375', 'learning_rate': '0.0001114', 'ppl': '2.133', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.1', 'tokens/total': 21577728, 'tokens/trainable': 21336544, 'epoch': '3.115'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2634/5680 [6:59:04<6:38:59,  7.86s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2635/5680 [6:59:12<6:39:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5865', 'grad_norm': '0.316', 'learning_rate': '0.0001114', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.5', 'tokens/total': 21585920, 'tokens/trainable': 21344216, 'epoch': '3.115'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2635/5680 [6:59:12<6:39:05,  7.86s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2636/5680 [6:59:20<6:38:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8963', 'grad_norm': '0.333', 'learning_rate': '0.0001113', 'ppl': '2.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 21594112, 'tokens/trainable': 21352078, 'epoch': '3.116'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                       | 2636/5680 [6:59:20<6:38:38,  7.86s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2637/5680 [6:59:28<6:38:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.9814', 'grad_norm': '0.3297', 'learning_rate': '0.0001113', 'ppl': '2.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 21602304, 'tokens/trainable': 21360072, 'epoch': '3.116'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2637/5680 [6:59:28<6:38:19,  7.85s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2638/5680 [6:59:35<6:37:24,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6839', 'grad_norm': '0.3298', 'learning_rate': '0.0001112', 'ppl': '1.982', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '882.8', 'tokens/total': 21610496, 'tokens/trainable': 21366958, 'epoch': '3.116'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2638/5680 [6:59:35<6:37:24,  7.84s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2639/5680 [6:59:43<6:40:55,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4612', 'grad_norm': '0.3687', 'learning_rate': '0.0001111', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.5', 'tokens/total': 21618688, 'tokens/trainable': 21374668, 'epoch': '3.116'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 2639/5680 [6:59:43<6:40:55,  7.91s/it][2026-01-27 04:48:57,550] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:60403] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 04:48:57,648] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:60408] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 04:48:59,528] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:60408] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
[2026-01-27 04:48:59,534] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:60403] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][ATokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<03:32, 26.27 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:35, 57.19 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:04<00:58, 91.06 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:41, 126.40 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:33, 153.00 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:26, 190.73 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:22, 217.86 examples/s]
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:20, 239.60 examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:06<05:38, 16.45 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:06<02:25, 37.48 examples/s][ATokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:06<00:18, 257.96 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:16, 273.82 examples/s]
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:07<01:31, 58.65 examples/s][ATokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:15, 283.13 examples/s]
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:07<01:02, 84.35 examples/s][ATokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:07<00:15, 292.04 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:14, 297.94 examples/s]
Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:08<00:50, 101.38 examples/s][ATokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:14, 289.01 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:08<00:13, 311.31 examples/s]
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:09<00:42, 119.12 examples/s][ATokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:12, 308.13 examples/s]
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:09<00:34, 142.79 examples/s][ATokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:09<00:14, 265.98 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:11, 328.38 examples/s]
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:10<00:31, 151.37 examples/s][ATokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:11, 326.25 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:10<00:11, 322.50 examples/s]
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:10<00:27, 169.21 examples/s][ATokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:10<00:11, 313.61 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:10, 314.02 examples/s]
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:11<00:28, 162.88 examples/s][ATokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:11<00:10, 314.59 examples/s]
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:11<00:24, 186.59 examples/s][ATokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:11<00:10, 313.32 examples/s]
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:12<00:23, 191.48 examples/s][ATokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:09, 317.21 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:12<00:09, 319.83 examples/s]
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:12<00:22, 192.84 examples/s][ATokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:12<00:08, 319.19 examples/s]
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:13<00:21, 199.00 examples/s][ATokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:08, 317.86 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:13<00:08, 317.16 examples/s]
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:13<00:20, 196.95 examples/s][ATokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:13<00:07, 315.72 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:14<00:07, 323.55 examples/s]
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:14<00:20, 194.74 examples/s][ATokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:14<00:05, 385.47 examples/s]
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:14<00:20, 190.93 examples/s][ATokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:14<00:07, 291.65 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:15<00:06, 308.59 examples/s]
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:15<00:06, 309.66 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:15<00:20, 188.04 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:15<00:18, 202.75 examples/s][ATokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:15<00:06, 279.05 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:16<00:05, 323.16 examples/s]
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:16<00:16, 217.09 examples/s][ATokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:16<00:05, 326.23 examples/s]
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:16<00:16, 214.64 examples/s][ATokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:16<00:04, 319.43 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:17<00:04, 318.62 examples/s]
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:17<00:15, 212.85 examples/s][ATokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:17<00:04, 316.30 examples/s]
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:17<00:15, 214.74 examples/s][ATokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:17<00:03, 318.07 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:18<00:03, 311.28 examples/s]
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:18<00:15, 202.53 examples/s][ATokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:18<00:03, 322.98 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:18<00:02, 325.07 examples/s]
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:18<00:15, 198.80 examples/s][ATokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:19<00:02, 320.63 examples/s]
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:19<00:14, 204.58 examples/s][ATokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:19<00:02, 318.96 examples/s]
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:19<00:13, 211.07 examples/s][ATokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:19<00:01, 317.51 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:20<00:01, 310.23 examples/s]
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:20<00:13, 204.55 examples/s][ATokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:20<00:01, 318.98 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:20<00:00, 365.08 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:21<00:00, 307.91 examples/s]
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:21<00:14, 182.02 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:21<00:11, 215.17 examples/s][ATokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:21<00:00, 304.54 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:21<00:00, 347.61 examples/s]
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:21<00:11, 205.74 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:22<00:00, 255.85 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:22<00:11, 208.41 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:23<00:11, 190.31 examples/s][ADropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 913.08 examples/s]
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:23<00:10, 204.46 examples/s][ADropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1271.47 examples/s]
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:23<00:09, 208.48 examples/s][ADropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1504.97 examples/s]
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:24<00:09, 205.36 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:24<00:08, 220.01 examples/s][ADropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1669.31 examples/s]
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:25<00:08, 207.86 examples/s][ADropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1746.29 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1747.98 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1569.23 examples/s]
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:25<00:07, 210.46 examples/s][A
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:26<00:06, 221.58 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:26<00:06, 210.28 examples/s][AAdd position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1236.72 examples/s]
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:27<00:05, 214.62 examples/s][AAdd position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1910.92 examples/s]
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:27<00:05, 231.00 examples/s][AAdd position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2327.74 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2679.38 examples/s]
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:28<00:04, 233.39 examples/s][AAdd position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2793.55 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2384.41 examples/s]
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:29<00:05, 178.90 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:29<00:03, 226.72 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:29<00:02, 248.02 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:30<00:02, 232.67 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:30<00:02, 215.23 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:31<00:01, 225.76 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:31<00:01, 237.32 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:31<00:00, 226.96 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:32<00:00, 228.15 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:32<00:00, 249.93 examples/s][A
[2026-01-27 04:49:33,449] [WARNING] [py.warnings._showwarnmsg:109] [PID:60403] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:33<00:00, 167.84 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▌                                                                                                                                     | 1000/5677 [00:00<00:04, 1014.43 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1384.79 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1624.33 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:00, 1684.11 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1775.27 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1762.14 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1625.55 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1496.90 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:00<00:01, 2145.04 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2545.69 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2753.60 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2876.83 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:01<00:00, 2545.39 examples/s]
[2026-01-27 04:49:39,377] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:60408] Using single process for pack_parallel, running sequentially.
[2026-01-27 04:49:44,413] [WARNING] [py.warnings._showwarnmsg:109] [PID:60408] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2640/5680 [7:00:38<18:35:52, 22.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5585', 'grad_norm': '0.3616', 'learning_rate': '0.0001111', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 21626880, 'tokens/trainable': 21382816, 'epoch': '4'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2640/5680 [7:00:38<18:35:52, 22.02s/it] 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2641/5680 [7:00:46<14:59:55, 17.77s/it]                                                                                                                                                                                                                                             {'loss': '0.5997', 'grad_norm': '0.2725', 'learning_rate': '0.000111', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 21635072, 'tokens/trainable': 21390976, 'epoch': '4'}
 46%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2641/5680 [7:00:46<14:59:55, 17.77s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2642/5680 [7:00:54<12:29:15, 14.80s/it]                                                                                                                                                                                                                                             {'loss': '0.6546', 'grad_norm': '0.3012', 'learning_rate': '0.000111', 'ppl': '1.924', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 21643264, 'tokens/trainable': 21399146, 'epoch': '4.001'}
 47%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2642/5680 [7:00:54<12:29:15, 14.80s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2643/5680 [7:01:02<10:43:27, 12.71s/it]                                                                                                                                                                                                                                             {'loss': '0.6706', 'grad_norm': '0.3925', 'learning_rate': '0.0001109', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 21651456, 'tokens/trainable': 21407300, 'epoch': '4.001'}
 47%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2643/5680 [7:01:02<10:43:27, 12.71s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                      | 2644/5680 [7:01:10<9:29:43, 11.26s/it]                                                                                                                                                                                                                                             {'loss': '0.637', 'grad_norm': '0.3668', 'learning_rate': '0.0001109', 'ppl': '1.891', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 21659648, 'tokens/trainable': 21415428, 'epoch': '4.001'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                      | 2644/5680 [7:01:10<9:29:43, 11.26s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2645/5680 [7:01:18<8:37:52, 10.24s/it]                                                                                                                                                                                                                                             {'loss': '0.6909', 'grad_norm': '0.3587', 'learning_rate': '0.0001108', 'ppl': '1.995', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 21667840, 'tokens/trainable': 21423602, 'epoch': '4.001'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2645/5680 [7:01:18<8:37:52, 10.24s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2646/5680 [7:01:25<8:01:42,  9.53s/it]                                                                                                                                                                                                                                             {'loss': '0.4974', 'grad_norm': '0.4072', 'learning_rate': '0.0001108', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 21676032, 'tokens/trainable': 21431768, 'epoch': '4.001'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2646/5680 [7:01:25<8:01:42,  9.53s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2647/5680 [7:01:33<7:36:47,  9.04s/it]                                                                                                                                                                                                                                             {'loss': '0.4551', 'grad_norm': '0.3835', 'learning_rate': '0.0001107', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 21684224, 'tokens/trainable': 21439908, 'epoch': '4.001'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 2647/5680 [7:01:33<7:36:47,  9.04s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2648/5680 [7:01:41<7:19:00,  8.69s/it]                                                                                                                                                                                                                                             {'loss': '0.6433', 'grad_norm': '0.3631', 'learning_rate': '0.0001107', 'ppl': '1.903', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21692416, 'tokens/trainable': 21448072, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2648/5680 [7:01:41<7:19:00,  8.69s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2649/5680 [7:01:49<7:05:54,  8.43s/it]                                                                                                                                                                                                                                             {'loss': '0.6424', 'grad_norm': '0.395', 'learning_rate': '0.0001106', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 21700608, 'tokens/trainable': 21456250, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2649/5680 [7:01:49<7:05:54,  8.43s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2650/5680 [7:01:57<6:57:07,  8.26s/it]                                                                                                                                                                                                                                             {'loss': '0.5244', 'grad_norm': '0.3136', 'learning_rate': '0.0001105', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 21708800, 'tokens/trainable': 21464422, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2650/5680 [7:01:57<6:57:07,  8.26s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2651/5680 [7:02:05<6:51:26,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.649', 'grad_norm': '0.3428', 'learning_rate': '0.0001105', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 21716992, 'tokens/trainable': 21472576, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 2651/5680 [7:02:05<6:51:26,  8.15s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2652/5680 [7:02:13<6:46:51,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4621', 'grad_norm': '0.3207', 'learning_rate': '0.0001104', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 21725184, 'tokens/trainable': 21480744, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2652/5680 [7:02:13<6:46:51,  8.06s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2653/5680 [7:02:21<6:43:11,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4883', 'grad_norm': '0.2798', 'learning_rate': '0.0001104', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 21733376, 'tokens/trainable': 21488886, 'epoch': '4.002'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2653/5680 [7:02:21<6:43:11,  7.99s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2654/5680 [7:02:28<6:41:22,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.605', 'grad_norm': '0.4053', 'learning_rate': '0.0001103', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 21741568, 'tokens/trainable': 21497004, 'epoch': '4.003'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2654/5680 [7:02:28<6:41:22,  7.96s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2655/5680 [7:02:36<6:40:07,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.471', 'grad_norm': '0.3044', 'learning_rate': '0.0001103', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 21749760, 'tokens/trainable': 21505186, 'epoch': '4.003'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2655/5680 [7:02:36<6:40:07,  7.94s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2656/5680 [7:02:44<6:39:34,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3995', 'grad_norm': '0.317', 'learning_rate': '0.0001102', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 21757952, 'tokens/trainable': 21513372, 'epoch': '4.003'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2656/5680 [7:02:44<6:39:34,  7.93s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2657/5680 [7:02:52<6:43:01,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.8099', 'grad_norm': '0.3682', 'learning_rate': '0.0001102', 'ppl': '2.248', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.6', 'tokens/total': 21766144, 'tokens/trainable': 21521532, 'epoch': '4.003'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2657/5680 [7:02:52<6:43:01,  8.00s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2658/5680 [7:03:00<6:40:43,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3477', 'grad_norm': '0.2811', 'learning_rate': '0.0001101', 'ppl': '1.416', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 21774336, 'tokens/trainable': 21529700, 'epoch': '4.003'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 2658/5680 [7:03:00<6:40:43,  7.96s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2659/5680 [7:03:08<6:38:58,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5004', 'grad_norm': '0.3786', 'learning_rate': '0.00011', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 21782528, 'tokens/trainable': 21537848, 'epoch': '4.004'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2659/5680 [7:03:08<6:38:58,  7.92s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2660/5680 [7:03:16<6:37:41,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4894', 'grad_norm': '0.2971', 'learning_rate': '0.00011', 'ppl': '1.631', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 21790720, 'tokens/trainable': 21545976, 'epoch': '4.004'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2660/5680 [7:03:16<6:37:41,  7.90s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2661/5680 [7:03:24<6:37:04,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5932', 'grad_norm': '0.3043', 'learning_rate': '0.0001099', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21798912, 'tokens/trainable': 21554132, 'epoch': '4.004'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2661/5680 [7:03:24<6:37:04,  7.89s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2662/5680 [7:03:32<6:35:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9467', 'grad_norm': '0.4757', 'learning_rate': '0.0001099', 'ppl': '2.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 21807104, 'tokens/trainable': 21562294, 'epoch': '4.004'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 2662/5680 [7:03:32<6:35:40,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2663/5680 [7:03:39<6:35:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4553', 'grad_norm': '0.4003', 'learning_rate': '0.0001098', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 21815296, 'tokens/trainable': 21570430, 'epoch': '4.004'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2663/5680 [7:03:39<6:35:24,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2664/5680 [7:03:47<6:35:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5423', 'grad_norm': '0.344', 'learning_rate': '0.0001098', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 21823488, 'tokens/trainable': 21578600, 'epoch': '4.004'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2664/5680 [7:03:47<6:35:37,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2665/5680 [7:03:55<6:35:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5259', 'grad_norm': '0.3177', 'learning_rate': '0.0001097', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 21831680, 'tokens/trainable': 21586780, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2665/5680 [7:03:55<6:35:16,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2666/5680 [7:04:03<6:35:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.473', 'grad_norm': '0.2997', 'learning_rate': '0.0001097', 'ppl': '1.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 21839872, 'tokens/trainable': 21594968, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 2666/5680 [7:04:03<6:35:10,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2667/5680 [7:04:11<6:35:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4338', 'grad_norm': '0.2927', 'learning_rate': '0.0001096', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 21848064, 'tokens/trainable': 21603100, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2667/5680 [7:04:11<6:35:15,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2668/5680 [7:04:19<6:34:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5235', 'grad_norm': '0.3419', 'learning_rate': '0.0001096', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 21856256, 'tokens/trainable': 21611252, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2668/5680 [7:04:19<6:34:40,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2669/5680 [7:04:27<6:34:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.543', 'grad_norm': '0.297', 'learning_rate': '0.0001095', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 21864448, 'tokens/trainable': 21619428, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 2669/5680 [7:04:27<6:34:01,  7.85s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2670/5680 [7:04:34<6:34:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3263', 'grad_norm': '0.291', 'learning_rate': '0.0001094', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 21872640, 'tokens/trainable': 21627564, 'epoch': '4.005'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2670/5680 [7:04:34<6:34:10,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2671/5680 [7:04:42<6:33:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6335', 'grad_norm': '0.3601', 'learning_rate': '0.0001094', 'ppl': '1.884', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 21880832, 'tokens/trainable': 21635684, 'epoch': '4.006'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2671/5680 [7:04:42<6:33:45,  7.85s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2672/5680 [7:04:50<6:34:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6045', 'grad_norm': '0.4054', 'learning_rate': '0.0001093', 'ppl': '1.83', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 21889024, 'tokens/trainable': 21643792, 'epoch': '4.006'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2672/5680 [7:04:50<6:34:15,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2673/5680 [7:04:58<6:33:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5948', 'grad_norm': '0.328', 'learning_rate': '0.0001093', 'ppl': '1.813', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 21897216, 'tokens/trainable': 21651950, 'epoch': '4.006'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 2673/5680 [7:04:58<6:33:46,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2674/5680 [7:05:06<6:34:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5098', 'grad_norm': '0.2919', 'learning_rate': '0.0001092', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 21905408, 'tokens/trainable': 21660070, 'epoch': '4.006'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2674/5680 [7:05:06<6:34:30,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2675/5680 [7:05:14<6:34:28,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5826', 'grad_norm': '0.3687', 'learning_rate': '0.0001092', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21913600, 'tokens/trainable': 21668242, 'epoch': '4.006'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2675/5680 [7:05:14<6:34:28,  7.88s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2676/5680 [7:05:22<6:33:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.581', 'grad_norm': '0.3366', 'learning_rate': '0.0001091', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 21921792, 'tokens/trainable': 21676336, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2676/5680 [7:05:22<6:33:52,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2677/5680 [7:05:30<6:33:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6465', 'grad_norm': '0.3018', 'learning_rate': '0.0001091', 'ppl': '1.909', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 21929984, 'tokens/trainable': 21684496, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 2677/5680 [7:05:30<6:33:13,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2678/5680 [7:05:37<6:33:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6133', 'grad_norm': '0.3872', 'learning_rate': '0.000109', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 21938176, 'tokens/trainable': 21692626, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2678/5680 [7:05:37<6:33:22,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2679/5680 [7:05:45<6:33:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6558', 'grad_norm': '0.3623', 'learning_rate': '0.0001089', 'ppl': '1.927', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 21946368, 'tokens/trainable': 21700720, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2679/5680 [7:05:45<6:33:12,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2680/5680 [7:05:53<6:33:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.451', 'grad_norm': '0.3103', 'learning_rate': '0.0001089', 'ppl': '1.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21954560, 'tokens/trainable': 21708884, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                     | 2680/5680 [7:05:53<6:33:18,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2681/5680 [7:06:01<6:32:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5451', 'grad_norm': '0.295', 'learning_rate': '0.0001088', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 21962752, 'tokens/trainable': 21717044, 'epoch': '4.007'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2681/5680 [7:06:01<6:32:58,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2682/5680 [7:06:09<6:33:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3789', 'grad_norm': '0.2882', 'learning_rate': '0.0001088', 'ppl': '1.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 21970944, 'tokens/trainable': 21725166, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2682/5680 [7:06:09<6:33:12,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2683/5680 [7:06:17<6:33:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6101', 'grad_norm': '0.3401', 'learning_rate': '0.0001087', 'ppl': '1.841', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 21979136, 'tokens/trainable': 21733300, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2683/5680 [7:06:17<6:33:04,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2684/5680 [7:06:25<6:32:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7767', 'grad_norm': '0.3654', 'learning_rate': '0.0001087', 'ppl': '2.174', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 21987328, 'tokens/trainable': 21741436, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2684/5680 [7:06:25<6:32:34,  7.86s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2685/5680 [7:06:32<6:31:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6474', 'grad_norm': '0.3163', 'learning_rate': '0.0001086', 'ppl': '1.911', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 21995520, 'tokens/trainable': 21749610, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2685/5680 [7:06:32<6:31:58,  7.85s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2686/5680 [7:06:40<6:32:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.502', 'grad_norm': '0.3449', 'learning_rate': '0.0001086', 'ppl': '1.652', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22003712, 'tokens/trainable': 21757740, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2686/5680 [7:06:40<6:32:28,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2687/5680 [7:06:48<6:33:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.478', 'grad_norm': '0.3165', 'learning_rate': '0.0001085', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22011904, 'tokens/trainable': 21765924, 'epoch': '4.008'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2687/5680 [7:06:48<6:33:00,  7.88s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2688/5680 [7:06:56<6:32:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5089', 'grad_norm': '0.4306', 'learning_rate': '0.0001085', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 22020096, 'tokens/trainable': 21774112, 'epoch': '4.009'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2688/5680 [7:06:56<6:32:20,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2689/5680 [7:07:04<6:32:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4569', 'grad_norm': '0.345', 'learning_rate': '0.0001084', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22028288, 'tokens/trainable': 21782266, 'epoch': '4.009'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2689/5680 [7:07:04<6:32:10,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2690/5680 [7:07:12<6:32:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7473', 'grad_norm': '0.3445', 'learning_rate': '0.0001083', 'ppl': '2.111', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22036480, 'tokens/trainable': 21790416, 'epoch': '4.009'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2690/5680 [7:07:12<6:32:08,  7.87s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2691/5680 [7:07:20<6:32:28,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7523', 'grad_norm': '0.331', 'learning_rate': '0.0001083', 'ppl': '2.122', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 22044672, 'tokens/trainable': 21798556, 'epoch': '4.009'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2691/5680 [7:07:20<6:32:28,  7.88s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2692/5680 [7:07:28<6:32:28,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4841', 'grad_norm': '0.3018', 'learning_rate': '0.0001082', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22052864, 'tokens/trainable': 21806720, 'epoch': '4.009'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 2692/5680 [7:07:28<6:32:28,  7.88s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2693/5680 [7:07:35<6:31:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6282', 'grad_norm': '0.3168', 'learning_rate': '0.0001082', 'ppl': '1.874', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22061056, 'tokens/trainable': 21814896, 'epoch': '4.01'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2693/5680 [7:07:35<6:31:46,  7.87s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2694/5680 [7:07:43<6:31:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6921', 'grad_norm': '0.3684', 'learning_rate': '0.0001081', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 22069248, 'tokens/trainable': 21823040, 'epoch': '4.01'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2694/5680 [7:07:43<6:31:54,  7.87s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2695/5680 [7:07:51<6:31:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7575', 'grad_norm': '0.3763', 'learning_rate': '0.0001081', 'ppl': '2.133', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22077440, 'tokens/trainable': 21831200, 'epoch': '4.01'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 2695/5680 [7:07:51<6:31:11,  7.86s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2696/5680 [7:07:59<6:31:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.557', 'grad_norm': '0.2931', 'learning_rate': '0.000108', 'ppl': '1.745', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22085632, 'tokens/trainable': 21839352, 'epoch': '4.01'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2696/5680 [7:07:59<6:31:27,  7.87s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2697/5680 [7:08:07<6:30:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4949', 'grad_norm': '0.2926', 'learning_rate': '0.000108', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 22093824, 'tokens/trainable': 21847544, 'epoch': '4.01'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2697/5680 [7:08:07<6:30:54,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2698/5680 [7:08:15<6:30:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3994', 'grad_norm': '0.3249', 'learning_rate': '0.0001079', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22102016, 'tokens/trainable': 21855720, 'epoch': '4.01'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2698/5680 [7:08:15<6:30:51,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2699/5680 [7:08:23<6:30:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4997', 'grad_norm': '0.3938', 'learning_rate': '0.0001078', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22110208, 'tokens/trainable': 21863848, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 2699/5680 [7:08:23<6:30:38,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2700/5680 [7:08:30<6:30:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3933', 'grad_norm': '0.2541', 'learning_rate': '0.0001078', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22118400, 'tokens/trainable': 21872008, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2700/5680 [7:08:31<6:30:33,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2701/5680 [7:08:38<6:30:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5277', 'grad_norm': '0.3248', 'learning_rate': '0.0001077', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22126592, 'tokens/trainable': 21880168, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2701/5680 [7:08:38<6:30:41,  7.87s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2702/5680 [7:08:46<6:29:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2744', 'grad_norm': '0.2781', 'learning_rate': '0.0001077', 'ppl': '1.316', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22134784, 'tokens/trainable': 21888292, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2702/5680 [7:08:46<6:29:51,  7.85s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2703/5680 [7:08:54<6:28:57,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6083', 'grad_norm': '0.3482', 'learning_rate': '0.0001076', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22142976, 'tokens/trainable': 21896424, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 2703/5680 [7:08:54<6:28:57,  7.84s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2704/5680 [7:09:02<6:29:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4142', 'grad_norm': '0.3605', 'learning_rate': '0.0001076', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22151168, 'tokens/trainable': 21904586, 'epoch': '4.011'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2704/5680 [7:09:02<6:29:27,  7.85s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2705/5680 [7:09:10<6:34:12,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5384', 'grad_norm': '0.3639', 'learning_rate': '0.0001075', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 22159360, 'tokens/trainable': 21912776, 'epoch': '4.012'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2705/5680 [7:09:10<6:34:12,  7.95s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2706/5680 [7:09:18<6:33:26,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4855', 'grad_norm': '0.3005', 'learning_rate': '0.0001075', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22167552, 'tokens/trainable': 21920916, 'epoch': '4.012'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 2706/5680 [7:09:18<6:33:26,  7.94s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2707/5680 [7:09:26<6:31:45,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6879', 'grad_norm': '0.3458', 'learning_rate': '0.0001074', 'ppl': '1.99', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22175744, 'tokens/trainable': 21929062, 'epoch': '4.012'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2707/5680 [7:09:26<6:31:45,  7.91s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2708/5680 [7:09:34<6:30:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6759', 'grad_norm': '0.3353', 'learning_rate': '0.0001073', 'ppl': '1.966', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22183936, 'tokens/trainable': 21937184, 'epoch': '4.012'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2708/5680 [7:09:34<6:30:14,  7.88s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2709/5680 [7:09:41<6:29:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.574', 'grad_norm': '0.3933', 'learning_rate': '0.0001073', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22192128, 'tokens/trainable': 21945300, 'epoch': '4.012'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2709/5680 [7:09:41<6:29:30,  7.87s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2710/5680 [7:09:49<6:28:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6117', 'grad_norm': '0.3986', 'learning_rate': '0.0001072', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22200320, 'tokens/trainable': 21953416, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 2710/5680 [7:09:49<6:28:57,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2711/5680 [7:09:57<6:28:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5573', 'grad_norm': '0.3678', 'learning_rate': '0.0001072', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22208512, 'tokens/trainable': 21961558, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2711/5680 [7:09:57<6:28:50,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2712/5680 [7:10:05<6:28:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3958', 'grad_norm': '0.3035', 'learning_rate': '0.0001071', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22216704, 'tokens/trainable': 21969672, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2712/5680 [7:10:05<6:28:35,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2713/5680 [7:10:13<6:28:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8314', 'grad_norm': '0.3706', 'learning_rate': '0.0001071', 'ppl': '2.296', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22224896, 'tokens/trainable': 21977818, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2713/5680 [7:10:13<6:28:17,  7.85s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2714/5680 [7:10:21<6:28:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.497', 'grad_norm': '0.3192', 'learning_rate': '0.000107', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 22233088, 'tokens/trainable': 21985994, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 2714/5680 [7:10:21<6:28:15,  7.85s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2715/5680 [7:10:29<6:28:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6729', 'grad_norm': '0.4713', 'learning_rate': '0.000107', 'ppl': '1.96', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22241280, 'tokens/trainable': 21994144, 'epoch': '4.013'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2715/5680 [7:10:29<6:28:13,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2716/5680 [7:10:36<6:28:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.559', 'grad_norm': '0.3757', 'learning_rate': '0.0001069', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22249472, 'tokens/trainable': 22002246, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2716/5680 [7:10:36<6:28:16,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2717/5680 [7:10:44<6:28:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4916', 'grad_norm': '0.3761', 'learning_rate': '0.0001069', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 22257664, 'tokens/trainable': 22010340, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 2717/5680 [7:10:44<6:28:28,  7.87s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2718/5680 [7:10:52<6:28:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4412', 'grad_norm': '0.3074', 'learning_rate': '0.0001068', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22265856, 'tokens/trainable': 22018470, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2718/5680 [7:10:52<6:28:43,  7.87s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2719/5680 [7:11:00<6:27:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4356', 'grad_norm': '0.3096', 'learning_rate': '0.0001067', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22274048, 'tokens/trainable': 22026614, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2719/5680 [7:11:00<6:27:56,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2720/5680 [7:11:08<6:27:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4462', 'grad_norm': '0.2931', 'learning_rate': '0.0001067', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22282240, 'tokens/trainable': 22034756, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2720/5680 [7:11:08<6:27:33,  7.86s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2721/5680 [7:11:16<6:27:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4652', 'grad_norm': '0.3512', 'learning_rate': '0.0001066', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22290432, 'tokens/trainable': 22042880, 'epoch': '4.014'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 2721/5680 [7:11:16<6:27:22,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2722/5680 [7:11:24<6:27:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6429', 'grad_norm': '0.3301', 'learning_rate': '0.0001066', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22298624, 'tokens/trainable': 22051044, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2722/5680 [7:11:24<6:27:25,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2723/5680 [7:11:31<6:26:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6901', 'grad_norm': '0.3366', 'learning_rate': '0.0001065', 'ppl': '1.994', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 22306816, 'tokens/trainable': 22059208, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2723/5680 [7:11:31<6:26:43,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2724/5680 [7:11:39<6:26:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4595', 'grad_norm': '0.329', 'learning_rate': '0.0001065', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22315008, 'tokens/trainable': 22067352, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2724/5680 [7:11:39<6:26:46,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2725/5680 [7:11:47<6:26:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5177', 'grad_norm': '0.32', 'learning_rate': '0.0001064', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22323200, 'tokens/trainable': 22075492, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                    | 2725/5680 [7:11:47<6:26:29,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2726/5680 [7:11:55<6:26:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4331', 'grad_norm': '0.3252', 'learning_rate': '0.0001064', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22331392, 'tokens/trainable': 22083644, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2726/5680 [7:11:55<6:26:22,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2727/5680 [7:12:03<6:26:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6676', 'grad_norm': '0.3361', 'learning_rate': '0.0001063', 'ppl': '1.95', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22339584, 'tokens/trainable': 22091770, 'epoch': '4.015'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2727/5680 [7:12:03<6:26:27,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2728/5680 [7:12:11<6:26:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6967', 'grad_norm': '0.3375', 'learning_rate': '0.0001062', 'ppl': '2.007', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22347776, 'tokens/trainable': 22099870, 'epoch': '4.016'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2728/5680 [7:12:11<6:26:01,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2729/5680 [7:12:19<6:26:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6638', 'grad_norm': '0.3701', 'learning_rate': '0.0001062', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 22355968, 'tokens/trainable': 22107988, 'epoch': '4.016'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 2729/5680 [7:12:19<6:26:30,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2730/5680 [7:12:26<6:26:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6253', 'grad_norm': '0.3539', 'learning_rate': '0.0001061', 'ppl': '1.869', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22364160, 'tokens/trainable': 22116174, 'epoch': '4.016'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2730/5680 [7:12:26<6:26:41,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2731/5680 [7:12:34<6:26:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6687', 'grad_norm': '0.3686', 'learning_rate': '0.0001061', 'ppl': '1.952', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 22372352, 'tokens/trainable': 22124280, 'epoch': '4.016'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2731/5680 [7:12:34<6:26:52,  7.87s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2732/5680 [7:12:43<6:31:37,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3171', 'grad_norm': '0.4485', 'learning_rate': '0.000106', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994', 'tokens/total': 22380544, 'tokens/trainable': 22132432, 'epoch': '4.016'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 2732/5680 [7:12:43<6:31:37,  7.97s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2733/5680 [7:12:50<6:29:34,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5032', 'grad_norm': '0.4675', 'learning_rate': '0.000106', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22388736, 'tokens/trainable': 22140598, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2733/5680 [7:12:50<6:29:34,  7.93s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2734/5680 [7:12:58<6:27:51,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5515', 'grad_norm': '0.3404', 'learning_rate': '0.0001059', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 22396928, 'tokens/trainable': 22148768, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2734/5680 [7:12:58<6:27:51,  7.90s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2735/5680 [7:13:06<6:27:09,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5711', 'grad_norm': '0.331', 'learning_rate': '0.0001059', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22405120, 'tokens/trainable': 22156952, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2735/5680 [7:13:06<6:27:09,  7.89s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2736/5680 [7:13:14<6:26:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.588', 'grad_norm': '0.3253', 'learning_rate': '0.0001058', 'ppl': '1.8', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22413312, 'tokens/trainable': 22165080, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 2736/5680 [7:13:14<6:26:14,  7.87s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2737/5680 [7:13:22<6:25:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7262', 'grad_norm': '0.3898', 'learning_rate': '0.0001057', 'ppl': '2.067', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 22421504, 'tokens/trainable': 22173204, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2737/5680 [7:13:22<6:25:57,  7.87s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2738/5680 [7:13:30<6:25:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5073', 'grad_norm': '0.3665', 'learning_rate': '0.0001057', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22429696, 'tokens/trainable': 22181336, 'epoch': '4.017'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2738/5680 [7:13:30<6:25:23,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2739/5680 [7:13:37<6:24:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.651', 'grad_norm': '0.4299', 'learning_rate': '0.0001056', 'ppl': '1.918', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22437888, 'tokens/trainable': 22189492, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2739/5680 [7:13:37<6:24:52,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2740/5680 [7:13:45<6:24:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6012', 'grad_norm': '0.3422', 'learning_rate': '0.0001056', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 22446080, 'tokens/trainable': 22197648, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 2740/5680 [7:13:45<6:24:31,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2741/5680 [7:13:53<6:24:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7424', 'grad_norm': '0.4062', 'learning_rate': '0.0001055', 'ppl': '2.101', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 22454272, 'tokens/trainable': 22205828, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2741/5680 [7:13:53<6:24:20,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2742/5680 [7:14:01<6:24:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6721', 'grad_norm': '0.3848', 'learning_rate': '0.0001055', 'ppl': '1.958', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22462464, 'tokens/trainable': 22214000, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2742/5680 [7:14:01<6:24:21,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2743/5680 [7:14:09<6:24:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5478', 'grad_norm': '0.3393', 'learning_rate': '0.0001054', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22470656, 'tokens/trainable': 22222176, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                   | 2743/5680 [7:14:09<6:24:23,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2744/5680 [7:14:17<6:24:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3242', 'grad_norm': '0.2587', 'learning_rate': '0.0001054', 'ppl': '1.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22478848, 'tokens/trainable': 22230328, 'epoch': '4.018'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2744/5680 [7:14:17<6:24:25,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2745/5680 [7:14:25<6:24:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6693', 'grad_norm': '0.3152', 'learning_rate': '0.0001053', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22487040, 'tokens/trainable': 22238504, 'epoch': '4.019'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2745/5680 [7:14:25<6:24:09,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2746/5680 [7:14:32<6:23:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8815', 'grad_norm': '0.4754', 'learning_rate': '0.0001053', 'ppl': '2.415', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22495232, 'tokens/trainable': 22246652, 'epoch': '4.019'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2746/5680 [7:14:32<6:23:58,  7.85s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2747/5680 [7:14:40<6:24:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4937', 'grad_norm': '0.3448', 'learning_rate': '0.0001052', 'ppl': '1.638', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 22503424, 'tokens/trainable': 22254768, 'epoch': '4.019'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2747/5680 [7:14:40<6:24:06,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2748/5680 [7:14:48<6:23:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4261', 'grad_norm': '0.2994', 'learning_rate': '0.0001051', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22511616, 'tokens/trainable': 22262952, 'epoch': '4.019'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2748/5680 [7:14:48<6:23:59,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2749/5680 [7:14:56<6:24:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5669', 'grad_norm': '0.395', 'learning_rate': '0.0001051', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22519808, 'tokens/trainable': 22271126, 'epoch': '4.019'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2749/5680 [7:14:56<6:24:25,  7.87s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2750/5680 [7:15:04<6:23:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6789', 'grad_norm': '0.413', 'learning_rate': '0.000105', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22528000, 'tokens/trainable': 22279242, 'epoch': '4.02'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2750/5680 [7:15:04<6:23:54,  7.86s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2751/5680 [7:15:12<6:23:14,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4086', 'grad_norm': '0.3378', 'learning_rate': '0.000105', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22536192, 'tokens/trainable': 22287358, 'epoch': '4.02'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 2751/5680 [7:15:12<6:23:14,  7.85s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2752/5680 [7:15:20<6:23:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5609', 'grad_norm': '0.3021', 'learning_rate': '0.0001049', 'ppl': '1.752', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22544384, 'tokens/trainable': 22295508, 'epoch': '4.02'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2752/5680 [7:15:20<6:23:04,  7.85s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2753/5680 [7:15:27<6:22:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.692', 'grad_norm': '0.3039', 'learning_rate': '0.0001049', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22552576, 'tokens/trainable': 22303650, 'epoch': '4.02'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2753/5680 [7:15:27<6:22:51,  7.85s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2754/5680 [7:15:35<6:22:29,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5835', 'grad_norm': '0.3243', 'learning_rate': '0.0001048', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 22560768, 'tokens/trainable': 22311842, 'epoch': '4.02'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 2754/5680 [7:15:35<6:22:29,  7.84s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2755/5680 [7:15:43<6:21:56,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5289', 'grad_norm': '0.3178', 'learning_rate': '0.0001048', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22568960, 'tokens/trainable': 22319924, 'epoch': '4.02'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2755/5680 [7:15:43<6:21:56,  7.83s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2756/5680 [7:15:51<6:26:26,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3803', 'grad_norm': '0.2603', 'learning_rate': '0.0001047', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 22577152, 'tokens/trainable': 22328080, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2756/5680 [7:15:51<6:26:26,  7.93s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2757/5680 [7:15:59<6:25:20,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7929', 'grad_norm': '0.3966', 'learning_rate': '0.0001046', 'ppl': '2.21', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22585344, 'tokens/trainable': 22336256, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2757/5680 [7:15:59<6:25:20,  7.91s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2758/5680 [7:16:07<6:24:36,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5483', 'grad_norm': '0.3112', 'learning_rate': '0.0001046', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22593536, 'tokens/trainable': 22344404, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 2758/5680 [7:16:07<6:24:36,  7.90s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2759/5680 [7:16:15<6:23:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7528', 'grad_norm': '0.4108', 'learning_rate': '0.0001045', 'ppl': '2.123', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 22601728, 'tokens/trainable': 22352580, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2759/5680 [7:16:15<6:23:20,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2760/5680 [7:16:23<6:24:10,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4015', 'grad_norm': '0.324', 'learning_rate': '0.0001045', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22609920, 'tokens/trainable': 22360752, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2760/5680 [7:16:23<6:24:10,  7.89s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2761/5680 [7:16:31<6:23:49,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5274', 'grad_norm': '0.4052', 'learning_rate': '0.0001044', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 22618112, 'tokens/trainable': 22368842, 'epoch': '4.021'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2761/5680 [7:16:31<6:23:49,  7.89s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2762/5680 [7:16:38<6:23:29,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6237', 'grad_norm': '0.3397', 'learning_rate': '0.0001044', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22626304, 'tokens/trainable': 22377016, 'epoch': '4.022'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 2762/5680 [7:16:38<6:23:29,  7.89s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2763/5680 [7:16:46<6:23:04,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4571', 'grad_norm': '0.2942', 'learning_rate': '0.0001043', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22634496, 'tokens/trainable': 22385180, 'epoch': '4.022'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2763/5680 [7:16:46<6:23:04,  7.88s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2764/5680 [7:16:54<6:22:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8402', 'grad_norm': '0.3888', 'learning_rate': '0.0001043', 'ppl': '2.317', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22642688, 'tokens/trainable': 22393312, 'epoch': '4.022'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2764/5680 [7:16:54<6:22:40,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2765/5680 [7:17:02<6:21:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5472', 'grad_norm': '0.3056', 'learning_rate': '0.0001042', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22650880, 'tokens/trainable': 22401436, 'epoch': '4.022'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2765/5680 [7:17:02<6:21:41,  7.86s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2766/5680 [7:17:10<6:22:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6584', 'grad_norm': '0.3248', 'learning_rate': '0.0001041', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 22659072, 'tokens/trainable': 22409588, 'epoch': '4.022'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 2766/5680 [7:17:10<6:22:09,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2767/5680 [7:17:18<6:21:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.709', 'grad_norm': '0.3843', 'learning_rate': '0.0001041', 'ppl': '2.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22667264, 'tokens/trainable': 22417732, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2767/5680 [7:17:18<6:21:26,  7.86s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2768/5680 [7:17:26<6:21:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7957', 'grad_norm': '0.3759', 'learning_rate': '0.000104', 'ppl': '2.216', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 22675456, 'tokens/trainable': 22425904, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2768/5680 [7:17:26<6:21:00,  7.85s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2769/5680 [7:17:33<6:21:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4138', 'grad_norm': '0.3687', 'learning_rate': '0.000104', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22683648, 'tokens/trainable': 22434092, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 2769/5680 [7:17:33<6:21:14,  7.86s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2770/5680 [7:17:41<6:22:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5523', 'grad_norm': '0.3257', 'learning_rate': '0.0001039', 'ppl': '1.737', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 22691840, 'tokens/trainable': 22442244, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2770/5680 [7:17:41<6:22:12,  7.88s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2771/5680 [7:17:49<6:21:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5415', 'grad_norm': '0.3194', 'learning_rate': '0.0001039', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22700032, 'tokens/trainable': 22450384, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2771/5680 [7:17:49<6:21:35,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2772/5680 [7:17:57<6:21:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.757', 'grad_norm': '0.3409', 'learning_rate': '0.0001038', 'ppl': '2.132', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22708224, 'tokens/trainable': 22458566, 'epoch': '4.023'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2772/5680 [7:17:57<6:21:25,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2773/5680 [7:18:05<6:21:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5286', 'grad_norm': '0.3082', 'learning_rate': '0.0001038', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22716416, 'tokens/trainable': 22466696, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 2773/5680 [7:18:05<6:21:15,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2774/5680 [7:18:13<6:20:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5439', 'grad_norm': '0.3347', 'learning_rate': '0.0001037', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22724608, 'tokens/trainable': 22474832, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2774/5680 [7:18:13<6:20:51,  7.86s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2775/5680 [7:18:21<6:21:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4475', 'grad_norm': '0.2872', 'learning_rate': '0.0001036', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22732800, 'tokens/trainable': 22482994, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2775/5680 [7:18:21<6:21:12,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2776/5680 [7:18:29<6:21:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5829', 'grad_norm': '0.4144', 'learning_rate': '0.0001036', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 22740992, 'tokens/trainable': 22491152, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2776/5680 [7:18:29<6:21:34,  7.88s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2777/5680 [7:18:36<6:20:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3404', 'grad_norm': '0.3113', 'learning_rate': '0.0001035', 'ppl': '1.406', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22749184, 'tokens/trainable': 22499300, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 2777/5680 [7:18:36<6:20:53,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2778/5680 [7:18:44<6:20:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7602', 'grad_norm': '0.3517', 'learning_rate': '0.0001035', 'ppl': '2.139', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22757376, 'tokens/trainable': 22507484, 'epoch': '4.024'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2778/5680 [7:18:44<6:20:46,  7.87s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2779/5680 [7:18:52<6:20:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7095', 'grad_norm': '0.4006', 'learning_rate': '0.0001034', 'ppl': '2.033', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22765568, 'tokens/trainable': 22515624, 'epoch': '4.025'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2779/5680 [7:18:52<6:20:08,  7.86s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2780/5680 [7:19:00<6:20:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4013', 'grad_norm': '0.352', 'learning_rate': '0.0001034', 'ppl': '1.494', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22773760, 'tokens/trainable': 22523760, 'epoch': '4.025'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2780/5680 [7:19:00<6:20:04,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2781/5680 [7:19:08<6:20:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3561', 'grad_norm': '0.3809', 'learning_rate': '0.0001033', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22781952, 'tokens/trainable': 22531924, 'epoch': '4.025'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2781/5680 [7:19:08<6:20:06,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2782/5680 [7:19:16<6:19:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6373', 'grad_norm': '0.3621', 'learning_rate': '0.0001033', 'ppl': '1.891', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22790144, 'tokens/trainable': 22540084, 'epoch': '4.025'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2782/5680 [7:19:16<6:19:28,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2783/5680 [7:19:24<6:19:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4074', 'grad_norm': '0.3103', 'learning_rate': '0.0001032', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 22798336, 'tokens/trainable': 22548194, 'epoch': '4.025'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2783/5680 [7:19:24<6:19:28,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2784/5680 [7:19:31<6:18:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6045', 'grad_norm': '0.4024', 'learning_rate': '0.0001032', 'ppl': '1.83', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22806528, 'tokens/trainable': 22556288, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 2784/5680 [7:19:31<6:18:40,  7.85s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2785/5680 [7:19:39<6:18:28,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6878', 'grad_norm': '0.4395', 'learning_rate': '0.0001031', 'ppl': '1.989', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22814720, 'tokens/trainable': 22564436, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2785/5680 [7:19:39<6:18:28,  7.84s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2786/5680 [7:19:47<6:18:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5525', 'grad_norm': '0.3208', 'learning_rate': '0.000103', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 22822912, 'tokens/trainable': 22572604, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2786/5680 [7:19:47<6:18:57,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2787/5680 [7:19:55<6:18:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4443', 'grad_norm': '0.2936', 'learning_rate': '0.000103', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22831104, 'tokens/trainable': 22580772, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2787/5680 [7:19:55<6:18:56,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2788/5680 [7:20:03<6:19:51,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5671', 'grad_norm': '0.4869', 'learning_rate': '0.0001029', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 22839296, 'tokens/trainable': 22588960, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 2788/5680 [7:20:03<6:19:51,  7.88s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2789/5680 [7:20:11<6:19:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6049', 'grad_norm': '0.3409', 'learning_rate': '0.0001029', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 22847488, 'tokens/trainable': 22597142, 'epoch': '4.026'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2789/5680 [7:20:11<6:19:24,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2790/5680 [7:20:19<6:18:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7672', 'grad_norm': '0.3517', 'learning_rate': '0.0001028', 'ppl': '2.154', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 22855680, 'tokens/trainable': 22605326, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2790/5680 [7:20:19<6:18:42,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2791/5680 [7:20:26<6:18:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5509', 'grad_norm': '0.3135', 'learning_rate': '0.0001028', 'ppl': '1.735', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22863872, 'tokens/trainable': 22613488, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 2791/5680 [7:20:26<6:18:29,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2792/5680 [7:20:34<6:18:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3421', 'grad_norm': '0.2754', 'learning_rate': '0.0001027', 'ppl': '1.408', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22872064, 'tokens/trainable': 22621676, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2792/5680 [7:20:34<6:18:42,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2793/5680 [7:20:42<6:18:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.501', 'grad_norm': '0.267', 'learning_rate': '0.0001027', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 22880256, 'tokens/trainable': 22629868, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2793/5680 [7:20:42<6:18:51,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2794/5680 [7:20:50<6:18:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.541', 'grad_norm': '0.3308', 'learning_rate': '0.0001026', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 22888448, 'tokens/trainable': 22637976, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2794/5680 [7:20:50<6:18:17,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2795/5680 [7:20:58<6:17:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5805', 'grad_norm': '0.304', 'learning_rate': '0.0001025', 'ppl': '1.787', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 22896640, 'tokens/trainable': 22646144, 'epoch': '4.027'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 2795/5680 [7:20:58<6:17:39,  7.85s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2796/5680 [7:21:06<6:17:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7707', 'grad_norm': '0.3444', 'learning_rate': '0.0001025', 'ppl': '2.161', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22904832, 'tokens/trainable': 22654308, 'epoch': '4.028'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2796/5680 [7:21:06<6:17:30,  7.85s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2797/5680 [7:21:14<6:17:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4989', 'grad_norm': '0.322', 'learning_rate': '0.0001024', 'ppl': '1.647', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22913024, 'tokens/trainable': 22662428, 'epoch': '4.028'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2797/5680 [7:21:14<6:17:48,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2798/5680 [7:21:22<6:18:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3019', 'grad_norm': '0.2792', 'learning_rate': '0.0001024', 'ppl': '1.352', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 22921216, 'tokens/trainable': 22670592, 'epoch': '4.028'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2798/5680 [7:21:22<6:18:20,  7.88s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2799/5680 [7:21:29<6:18:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5336', 'grad_norm': '0.388', 'learning_rate': '0.0001023', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22929408, 'tokens/trainable': 22678758, 'epoch': '4.028'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 2799/5680 [7:21:29<6:18:16,  7.88s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2800/5680 [7:21:37<6:17:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4865', 'grad_norm': '0.344', 'learning_rate': '0.0001023', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 22937600, 'tokens/trainable': 22686884, 'epoch': '4.028'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2800/5680 [7:21:37<6:17:48,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2801/5680 [7:21:45<6:18:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6231', 'grad_norm': '0.342', 'learning_rate': '0.0001022', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 22945792, 'tokens/trainable': 22695040, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2801/5680 [7:21:45<6:18:16,  7.88s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2802/5680 [7:21:53<6:17:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5266', 'grad_norm': '0.354', 'learning_rate': '0.0001022', 'ppl': '1.693', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 22953984, 'tokens/trainable': 22703188, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2802/5680 [7:21:53<6:17:37,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2803/5680 [7:22:01<6:16:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6831', 'grad_norm': '0.3942', 'learning_rate': '0.0001021', 'ppl': '1.98', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 22962176, 'tokens/trainable': 22711306, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 2803/5680 [7:22:01<6:16:59,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2804/5680 [7:22:09<6:17:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.438', 'grad_norm': '0.335', 'learning_rate': '0.000102', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 22970368, 'tokens/trainable': 22719492, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2804/5680 [7:22:09<6:17:00,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2805/5680 [7:22:17<6:16:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6454', 'grad_norm': '0.3529', 'learning_rate': '0.000102', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 22978560, 'tokens/trainable': 22727632, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2805/5680 [7:22:17<6:16:16,  7.85s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2806/5680 [7:22:24<6:16:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5838', 'grad_norm': '0.3478', 'learning_rate': '0.0001019', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 22986752, 'tokens/trainable': 22735774, 'epoch': '4.029'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 2806/5680 [7:22:24<6:16:52,  7.87s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2807/5680 [7:22:32<6:16:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5916', 'grad_norm': '0.3223', 'learning_rate': '0.0001019', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 22994944, 'tokens/trainable': 22743944, 'epoch': '4.03'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2807/5680 [7:22:32<6:16:21,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2808/5680 [7:22:40<6:16:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6422', 'grad_norm': '0.3315', 'learning_rate': '0.0001018', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23003136, 'tokens/trainable': 22752086, 'epoch': '4.03'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2808/5680 [7:22:40<6:16:12,  7.86s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2809/5680 [7:22:48<6:16:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4531', 'grad_norm': '0.3468', 'learning_rate': '0.0001018', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 23011328, 'tokens/trainable': 22760208, 'epoch': '4.03'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2809/5680 [7:22:48<6:16:53,  7.88s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2810/5680 [7:22:56<6:16:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4909', 'grad_norm': '0.3732', 'learning_rate': '0.0001017', 'ppl': '1.634', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23019520, 'tokens/trainable': 22768352, 'epoch': '4.03'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2810/5680 [7:22:56<6:16:23,  7.87s/it] 49%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2811/5680 [7:23:04<6:19:44,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4597', 'grad_norm': '0.3223', 'learning_rate': '0.0001017', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 23027712, 'tokens/trainable': 22776522, 'epoch': '4.03'}
 49%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2811/5680 [7:23:04<6:19:44,  7.94s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2812/5680 [7:23:12<6:18:36,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4378', 'grad_norm': '0.331', 'learning_rate': '0.0001016', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 23035904, 'tokens/trainable': 22784660, 'epoch': '4.03'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2812/5680 [7:23:12<6:18:36,  7.92s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2813/5680 [7:23:20<6:18:06,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3064', 'grad_norm': '0.462', 'learning_rate': '0.0001015', 'ppl': '1.359', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 23044096, 'tokens/trainable': 22792788, 'epoch': '4.031'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2813/5680 [7:23:20<6:18:06,  7.91s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2814/5680 [7:23:28<6:17:23,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4769', 'grad_norm': '0.3198', 'learning_rate': '0.0001015', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 23052288, 'tokens/trainable': 22800900, 'epoch': '4.031'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 2814/5680 [7:23:28<6:17:23,  7.90s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2815/5680 [7:23:36<6:16:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6715', 'grad_norm': '0.3534', 'learning_rate': '0.0001014', 'ppl': '1.957', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 23060480, 'tokens/trainable': 22809060, 'epoch': '4.031'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2815/5680 [7:23:36<6:16:05,  7.88s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2816/5680 [7:23:43<6:15:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7107', 'grad_norm': '0.3568', 'learning_rate': '0.0001014', 'ppl': '2.035', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23068672, 'tokens/trainable': 22817204, 'epoch': '4.031'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2816/5680 [7:23:43<6:15:51,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2817/5680 [7:23:51<6:15:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5244', 'grad_norm': '0.3784', 'learning_rate': '0.0001013', 'ppl': '1.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23076864, 'tokens/trainable': 22825328, 'epoch': '4.031'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 2817/5680 [7:23:51<6:15:09,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2818/5680 [7:23:59<6:14:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3815', 'grad_norm': '0.3337', 'learning_rate': '0.0001013', 'ppl': '1.464', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 23085056, 'tokens/trainable': 22833520, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2818/5680 [7:23:59<6:14:59,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2819/5680 [7:24:07<6:14:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4338', 'grad_norm': '0.442', 'learning_rate': '0.0001012', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23093248, 'tokens/trainable': 22841668, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2819/5680 [7:24:07<6:14:40,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2820/5680 [7:24:15<6:14:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6659', 'grad_norm': '0.4493', 'learning_rate': '0.0001012', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23101440, 'tokens/trainable': 22849822, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2820/5680 [7:24:15<6:14:27,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2821/5680 [7:24:23<6:14:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4342', 'grad_norm': '0.302', 'learning_rate': '0.0001011', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23109632, 'tokens/trainable': 22857970, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 2821/5680 [7:24:23<6:14:12,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2822/5680 [7:24:30<6:14:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4825', 'grad_norm': '0.341', 'learning_rate': '0.0001011', 'ppl': '1.62', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23117824, 'tokens/trainable': 22866136, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2822/5680 [7:24:30<6:14:04,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2823/5680 [7:24:38<6:14:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5953', 'grad_norm': '0.4298', 'learning_rate': '0.000101', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23126016, 'tokens/trainable': 22874280, 'epoch': '4.032'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2823/5680 [7:24:38<6:14:27,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2824/5680 [7:24:46<6:14:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4351', 'grad_norm': '0.2911', 'learning_rate': '0.0001009', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23134208, 'tokens/trainable': 22882452, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2824/5680 [7:24:46<6:14:35,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2825/5680 [7:24:54<6:13:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4864', 'grad_norm': '0.3268', 'learning_rate': '0.0001009', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23142400, 'tokens/trainable': 22890572, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 2825/5680 [7:24:54<6:13:57,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2826/5680 [7:25:02<6:13:13,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6305', 'grad_norm': '0.3259', 'learning_rate': '0.0001008', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 23150592, 'tokens/trainable': 22898738, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2826/5680 [7:25:02<6:13:13,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2827/5680 [7:25:10<6:13:13,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6072', 'grad_norm': '0.298', 'learning_rate': '0.0001008', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 23158784, 'tokens/trainable': 22906848, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2827/5680 [7:25:10<6:13:13,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2828/5680 [7:25:18<6:13:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4587', 'grad_norm': '0.2887', 'learning_rate': '0.0001007', 'ppl': '1.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23166976, 'tokens/trainable': 22915024, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2828/5680 [7:25:18<6:13:20,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2829/5680 [7:25:25<6:12:14,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5404', 'grad_norm': '0.3082', 'learning_rate': '0.0001007', 'ppl': '1.717', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1052', 'tokens/total': 23175168, 'tokens/trainable': 22923212, 'epoch': '4.033'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2829/5680 [7:25:25<6:12:14,  7.83s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2830/5680 [7:25:33<6:12:23,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6499', 'grad_norm': '0.374', 'learning_rate': '0.0001006', 'ppl': '1.915', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23183360, 'tokens/trainable': 22931382, 'epoch': '4.034'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2830/5680 [7:25:33<6:12:23,  7.84s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2831/5680 [7:25:41<6:12:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7093', 'grad_norm': '0.4876', 'learning_rate': '0.0001006', 'ppl': '2.033', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 23191552, 'tokens/trainable': 22939500, 'epoch': '4.034'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2831/5680 [7:25:41<6:12:36,  7.85s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2832/5680 [7:25:49<6:13:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6135', 'grad_norm': '0.3139', 'learning_rate': '0.0001005', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23199744, 'tokens/trainable': 22947636, 'epoch': '4.034'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2832/5680 [7:25:49<6:13:03,  7.86s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2833/5680 [7:25:57<6:13:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4338', 'grad_norm': '0.2895', 'learning_rate': '0.0001004', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23207936, 'tokens/trainable': 22955798, 'epoch': '4.034'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2833/5680 [7:25:57<6:13:38,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2834/5680 [7:26:05<6:13:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4829', 'grad_norm': '0.2868', 'learning_rate': '0.0001004', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23216128, 'tokens/trainable': 22963972, 'epoch': '4.034'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2834/5680 [7:26:05<6:13:11,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2835/5680 [7:26:13<6:13:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5317', 'grad_norm': '0.3607', 'learning_rate': '0.0001003', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 23224320, 'tokens/trainable': 22972084, 'epoch': '4.035'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2835/5680 [7:26:13<6:13:03,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2836/5680 [7:26:20<6:12:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5477', 'grad_norm': '0.3788', 'learning_rate': '0.0001003', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 23232512, 'tokens/trainable': 22980212, 'epoch': '4.035'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 2836/5680 [7:26:20<6:12:53,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2837/5680 [7:26:28<6:13:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5538', 'grad_norm': '0.3433', 'learning_rate': '0.0001002', 'ppl': '1.74', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 23240704, 'tokens/trainable': 22988334, 'epoch': '4.035'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2837/5680 [7:26:28<6:13:04,  7.87s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2838/5680 [7:26:36<6:13:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3686', 'grad_norm': '0.3216', 'learning_rate': '0.0001002', 'ppl': '1.446', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 23248896, 'tokens/trainable': 22996428, 'epoch': '4.035'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2838/5680 [7:26:36<6:13:13,  7.88s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2839/5680 [7:26:44<6:12:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3511', 'grad_norm': '0.3386', 'learning_rate': '0.0001001', 'ppl': '1.421', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23257088, 'tokens/trainable': 23004584, 'epoch': '4.035'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                | 2839/5680 [7:26:44<6:12:46,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2840/5680 [7:26:52<6:12:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5757', 'grad_norm': '0.3628', 'learning_rate': '0.0001001', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 23265280, 'tokens/trainable': 23012664, 'epoch': '4.035'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2840/5680 [7:26:52<6:12:49,  7.88s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2841/5680 [7:27:00<6:12:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4969', 'grad_norm': '0.3357', 'learning_rate': '0.0001', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 23273472, 'tokens/trainable': 23020828, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2841/5680 [7:27:00<6:12:05,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2842/5680 [7:27:08<6:11:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7083', 'grad_norm': '0.423', 'learning_rate': '9.994e-05', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 23281664, 'tokens/trainable': 23028946, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2842/5680 [7:27:08<6:11:56,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2843/5680 [7:27:16<6:12:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5738', 'grad_norm': '0.3439', 'learning_rate': '9.989e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23289856, 'tokens/trainable': 23037096, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 2843/5680 [7:27:16<6:12:00,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2844/5680 [7:27:23<6:11:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6249', 'grad_norm': '0.3521', 'learning_rate': '9.983e-05', 'ppl': '1.868', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23298048, 'tokens/trainable': 23045264, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2844/5680 [7:27:23<6:11:56,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2845/5680 [7:27:31<6:11:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5224', 'grad_norm': '0.4072', 'learning_rate': '9.978e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 23306240, 'tokens/trainable': 23053446, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2845/5680 [7:27:31<6:11:34,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2846/5680 [7:27:39<6:11:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4843', 'grad_norm': '0.3326', 'learning_rate': '9.972e-05', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 23314432, 'tokens/trainable': 23061568, 'epoch': '4.036'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2846/5680 [7:27:39<6:11:42,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2847/5680 [7:27:47<6:11:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4416', 'grad_norm': '0.3067', 'learning_rate': '9.967e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 23322624, 'tokens/trainable': 23069632, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 2847/5680 [7:27:47<6:11:52,  7.88s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2848/5680 [7:27:55<6:12:22,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3988', 'grad_norm': '0.3197', 'learning_rate': '9.961e-05', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 23330816, 'tokens/trainable': 23077770, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2848/5680 [7:27:55<6:12:22,  7.89s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2849/5680 [7:28:03<6:11:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5245', 'grad_norm': '0.3078', 'learning_rate': '9.956e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23339008, 'tokens/trainable': 23085920, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2849/5680 [7:28:03<6:11:53,  7.88s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2850/5680 [7:28:11<6:15:14,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5641', 'grad_norm': '0.3475', 'learning_rate': '9.95e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 23347200, 'tokens/trainable': 23094076, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2850/5680 [7:28:11<6:15:14,  7.96s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2851/5680 [7:28:19<6:13:51,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6391', 'grad_norm': '0.3', 'learning_rate': '9.945e-05', 'ppl': '1.895', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23355392, 'tokens/trainable': 23102226, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 2851/5680 [7:28:19<6:13:51,  7.93s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2852/5680 [7:28:27<6:13:21,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5513', 'grad_norm': '0.4312', 'learning_rate': '9.939e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23363584, 'tokens/trainable': 23110380, 'epoch': '4.037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2852/5680 [7:28:27<6:13:21,  7.92s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2853/5680 [7:28:35<6:11:53,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5722', 'grad_norm': '0.3732', 'learning_rate': '9.934e-05', 'ppl': '1.772', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23371776, 'tokens/trainable': 23118490, 'epoch': '4.038'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2853/5680 [7:28:35<6:11:53,  7.89s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2854/5680 [7:28:43<6:11:52,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4558', 'grad_norm': '0.3327', 'learning_rate': '9.928e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23379968, 'tokens/trainable': 23126674, 'epoch': '4.038'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 2854/5680 [7:28:43<6:11:52,  7.90s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2855/5680 [7:28:50<6:10:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4181', 'grad_norm': '0.2757', 'learning_rate': '9.923e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23388160, 'tokens/trainable': 23134798, 'epoch': '4.038'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2855/5680 [7:28:50<6:10:57,  7.88s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2856/5680 [7:28:58<6:10:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.455', 'grad_norm': '0.4127', 'learning_rate': '9.917e-05', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23396352, 'tokens/trainable': 23142952, 'epoch': '4.038'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2856/5680 [7:28:58<6:10:13,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2857/5680 [7:29:06<6:09:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3998', 'grad_norm': '0.3153', 'learning_rate': '9.912e-05', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23404544, 'tokens/trainable': 23151120, 'epoch': '4.038'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2857/5680 [7:29:06<6:09:57,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2858/5680 [7:29:14<6:09:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.9008', 'grad_norm': '0.39', 'learning_rate': '9.906e-05', 'ppl': '2.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23412736, 'tokens/trainable': 23159272, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 2858/5680 [7:29:14<6:09:34,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2859/5680 [7:29:22<6:09:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.54', 'grad_norm': '0.3621', 'learning_rate': '9.9e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 23420928, 'tokens/trainable': 23167392, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2859/5680 [7:29:22<6:09:22,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2860/5680 [7:29:30<6:09:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7681', 'grad_norm': '0.3673', 'learning_rate': '9.895e-05', 'ppl': '2.156', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23429120, 'tokens/trainable': 23175542, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2860/5680 [7:29:30<6:09:22,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2861/5680 [7:29:37<6:09:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5217', 'grad_norm': '0.3172', 'learning_rate': '9.889e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23437312, 'tokens/trainable': 23183704, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2861/5680 [7:29:37<6:09:04,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2862/5680 [7:29:45<6:09:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6001', 'grad_norm': '0.3339', 'learning_rate': '9.884e-05', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23445504, 'tokens/trainable': 23191836, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 2862/5680 [7:29:45<6:09:00,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2863/5680 [7:29:53<6:09:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4591', 'grad_norm': '0.3601', 'learning_rate': '9.878e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23453696, 'tokens/trainable': 23200012, 'epoch': '4.039'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2863/5680 [7:29:53<6:09:28,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2864/5680 [7:30:01<6:09:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7283', 'grad_norm': '0.3679', 'learning_rate': '9.873e-05', 'ppl': '2.072', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 23461888, 'tokens/trainable': 23208126, 'epoch': '4.04'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2864/5680 [7:30:01<6:09:24,  7.87s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2865/5680 [7:30:09<6:08:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.397', 'grad_norm': '0.3157', 'learning_rate': '9.867e-05', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23470080, 'tokens/trainable': 23216278, 'epoch': '4.04'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 2865/5680 [7:30:09<6:08:47,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2866/5680 [7:30:17<6:08:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6444', 'grad_norm': '0.2947', 'learning_rate': '9.862e-05', 'ppl': '1.905', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23478272, 'tokens/trainable': 23224428, 'epoch': '4.04'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2866/5680 [7:30:17<6:08:27,  7.86s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2867/5680 [7:30:25<6:08:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6399', 'grad_norm': '0.368', 'learning_rate': '9.856e-05', 'ppl': '1.896', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 23486464, 'tokens/trainable': 23232600, 'epoch': '4.04'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2867/5680 [7:30:25<6:08:07,  7.85s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2868/5680 [7:30:32<6:08:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4898', 'grad_norm': '0.3306', 'learning_rate': '9.851e-05', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23494656, 'tokens/trainable': 23240736, 'epoch': '4.04'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2868/5680 [7:30:32<6:08:03,  7.85s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2869/5680 [7:30:40<6:07:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4656', 'grad_norm': '0.3709', 'learning_rate': '9.845e-05', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23502848, 'tokens/trainable': 23248872, 'epoch': '4.04'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 2869/5680 [7:30:40<6:07:37,  7.85s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2870/5680 [7:30:48<6:12:22,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4068', 'grad_norm': '0.4545', 'learning_rate': '9.84e-05', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.6', 'tokens/total': 23511040, 'tokens/trainable': 23257044, 'epoch': '4.041'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2870/5680 [7:30:48<6:12:22,  7.95s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2871/5680 [7:30:56<6:11:12,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.72', 'grad_norm': '0.5332', 'learning_rate': '9.834e-05', 'ppl': '2.054', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23519232, 'tokens/trainable': 23265214, 'epoch': '4.041'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2871/5680 [7:30:56<6:11:12,  7.93s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2872/5680 [7:31:04<6:10:34,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5244', 'grad_norm': '0.375', 'learning_rate': '9.829e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23527424, 'tokens/trainable': 23273392, 'epoch': '4.041'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2872/5680 [7:31:04<6:10:34,  7.92s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2873/5680 [7:31:12<6:10:14,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3591', 'grad_norm': '0.2874', 'learning_rate': '9.823e-05', 'ppl': '1.432', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 23535616, 'tokens/trainable': 23281560, 'epoch': '4.041'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2873/5680 [7:31:12<6:10:14,  7.91s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2874/5680 [7:31:20<6:08:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5052', 'grad_norm': '0.3322', 'learning_rate': '9.817e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23543808, 'tokens/trainable': 23289684, 'epoch': '4.041'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2874/5680 [7:31:20<6:08:46,  7.89s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2875/5680 [7:31:28<6:08:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3773', 'grad_norm': '0.2935', 'learning_rate': '9.812e-05', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 23552000, 'tokens/trainable': 23297760, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2875/5680 [7:31:28<6:08:19,  7.88s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2876/5680 [7:31:36<6:07:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6901', 'grad_norm': '0.3542', 'learning_rate': '9.806e-05', 'ppl': '1.994', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23560192, 'tokens/trainable': 23305908, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 2876/5680 [7:31:36<6:07:37,  7.87s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2877/5680 [7:31:44<6:07:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5255', 'grad_norm': '0.3226', 'learning_rate': '9.801e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23568384, 'tokens/trainable': 23314072, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2877/5680 [7:31:44<6:07:29,  7.87s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2878/5680 [7:31:51<6:07:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5572', 'grad_norm': '0.3201', 'learning_rate': '9.795e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23576576, 'tokens/trainable': 23322208, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2878/5680 [7:31:51<6:07:36,  7.87s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2879/5680 [7:31:59<6:07:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5802', 'grad_norm': '0.4059', 'learning_rate': '9.79e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23584768, 'tokens/trainable': 23330324, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2879/5680 [7:31:59<6:07:23,  7.87s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2880/5680 [7:32:07<6:07:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4405', 'grad_norm': '0.3826', 'learning_rate': '9.784e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23592960, 'tokens/trainable': 23338424, 'epoch': '4.042'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2880/5680 [7:32:07<6:07:01,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2881/5680 [7:32:15<6:06:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7262', 'grad_norm': '0.3651', 'learning_rate': '9.779e-05', 'ppl': '2.067', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23601152, 'tokens/trainable': 23346538, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2881/5680 [7:32:15<6:06:31,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2882/5680 [7:32:23<6:06:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4557', 'grad_norm': '0.3281', 'learning_rate': '9.773e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23609344, 'tokens/trainable': 23354716, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2882/5680 [7:32:23<6:06:43,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2883/5680 [7:32:31<6:07:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5555', 'grad_norm': '0.3166', 'learning_rate': '9.768e-05', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 23617536, 'tokens/trainable': 23362840, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2883/5680 [7:32:31<6:07:20,  7.88s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2884/5680 [7:32:39<6:07:48,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5866', 'grad_norm': '0.4535', 'learning_rate': '9.762e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 23625728, 'tokens/trainable': 23370982, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 2884/5680 [7:32:39<6:07:48,  7.89s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2885/5680 [7:32:47<6:07:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5447', 'grad_norm': '0.4007', 'learning_rate': '9.757e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23633920, 'tokens/trainable': 23379136, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2885/5680 [7:32:47<6:07:01,  7.88s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2886/5680 [7:32:54<6:06:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4037', 'grad_norm': '0.3029', 'learning_rate': '9.751e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23642112, 'tokens/trainable': 23387284, 'epoch': '4.043'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2886/5680 [7:32:54<6:06:22,  7.87s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2887/5680 [7:33:02<6:06:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5035', 'grad_norm': '0.3374', 'learning_rate': '9.746e-05', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 23650304, 'tokens/trainable': 23395416, 'epoch': '4.044'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2887/5680 [7:33:02<6:06:39,  7.88s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2888/5680 [7:33:10<6:05:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6056', 'grad_norm': '0.3309', 'learning_rate': '9.74e-05', 'ppl': '1.832', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 23658496, 'tokens/trainable': 23403592, 'epoch': '4.044'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 2888/5680 [7:33:10<6:05:47,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2889/5680 [7:33:18<6:05:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6922', 'grad_norm': '0.439', 'learning_rate': '9.735e-05', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23666688, 'tokens/trainable': 23411752, 'epoch': '4.044'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2889/5680 [7:33:18<6:05:29,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2890/5680 [7:33:26<6:05:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8039', 'grad_norm': '0.3563', 'learning_rate': '9.729e-05', 'ppl': '2.234', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23674880, 'tokens/trainable': 23419912, 'epoch': '4.044'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2890/5680 [7:33:26<6:05:10,  7.85s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2891/5680 [7:33:34<6:05:02,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6189', 'grad_norm': '0.3448', 'learning_rate': '9.723e-05', 'ppl': '1.857', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 23683072, 'tokens/trainable': 23428088, 'epoch': '4.044'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 2891/5680 [7:33:34<6:05:02,  7.85s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2892/5680 [7:33:42<6:05:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4069', 'grad_norm': '0.2833', 'learning_rate': '9.718e-05', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23691264, 'tokens/trainable': 23436208, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2892/5680 [7:33:42<6:05:05,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2893/5680 [7:33:49<6:05:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5535', 'grad_norm': '0.3317', 'learning_rate': '9.712e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 23699456, 'tokens/trainable': 23444340, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2893/5680 [7:33:49<6:05:13,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2894/5680 [7:33:57<6:04:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6585', 'grad_norm': '0.3318', 'learning_rate': '9.707e-05', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23707648, 'tokens/trainable': 23452498, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2894/5680 [7:33:57<6:04:42,  7.85s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2895/5680 [7:34:05<6:04:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5677', 'grad_norm': '0.3572', 'learning_rate': '9.701e-05', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23715840, 'tokens/trainable': 23460662, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 2895/5680 [7:34:05<6:04:57,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2896/5680 [7:34:13<6:04:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5485', 'grad_norm': '0.4576', 'learning_rate': '9.696e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 23724032, 'tokens/trainable': 23468854, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2896/5680 [7:34:13<6:04:52,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2897/5680 [7:34:21<6:04:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5156', 'grad_norm': '0.3085', 'learning_rate': '9.69e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 23732224, 'tokens/trainable': 23476960, 'epoch': '4.045'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2897/5680 [7:34:21<6:04:25,  7.86s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2898/5680 [7:34:29<6:05:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3409', 'grad_norm': '0.2607', 'learning_rate': '9.685e-05', 'ppl': '1.406', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 23740416, 'tokens/trainable': 23485068, 'epoch': '4.046'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2898/5680 [7:34:29<6:05:10,  7.88s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2899/5680 [7:34:37<6:05:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5969', 'grad_norm': '0.3363', 'learning_rate': '9.679e-05', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 23748608, 'tokens/trainable': 23493176, 'epoch': '4.046'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 2899/5680 [7:34:37<6:05:10,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2900/5680 [7:34:44<6:04:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4641', 'grad_norm': '0.3282', 'learning_rate': '9.674e-05', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23756800, 'tokens/trainable': 23501332, 'epoch': '4.046'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2900/5680 [7:34:44<6:04:25,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2901/5680 [7:34:52<6:04:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4103', 'grad_norm': '0.3557', 'learning_rate': '9.668e-05', 'ppl': '1.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23764992, 'tokens/trainable': 23509504, 'epoch': '4.046'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2901/5680 [7:34:52<6:04:18,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2902/5680 [7:35:00<6:04:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.517', 'grad_norm': '0.2958', 'learning_rate': '9.663e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 23773184, 'tokens/trainable': 23517624, 'epoch': '4.046'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 2902/5680 [7:35:00<6:04:28,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2903/5680 [7:35:08<6:04:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5279', 'grad_norm': '0.3572', 'learning_rate': '9.657e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23781376, 'tokens/trainable': 23525804, 'epoch': '4.046'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2903/5680 [7:35:08<6:04:29,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2904/5680 [7:35:16<6:04:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4906', 'grad_norm': '0.3882', 'learning_rate': '9.652e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23789568, 'tokens/trainable': 23533930, 'epoch': '4.047'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2904/5680 [7:35:16<6:04:02,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2905/5680 [7:35:24<6:03:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6283', 'grad_norm': '0.3427', 'learning_rate': '9.646e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23797760, 'tokens/trainable': 23542080, 'epoch': '4.047'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2905/5680 [7:35:24<6:03:39,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2906/5680 [7:35:32<6:03:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.366', 'grad_norm': '0.2907', 'learning_rate': '9.641e-05', 'ppl': '1.442', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23805952, 'tokens/trainable': 23550264, 'epoch': '4.047'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 2906/5680 [7:35:32<6:03:36,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2907/5680 [7:35:40<6:03:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7315', 'grad_norm': '0.3658', 'learning_rate': '9.635e-05', 'ppl': '2.078', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23814144, 'tokens/trainable': 23558422, 'epoch': '4.047'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2907/5680 [7:35:40<6:03:16,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2908/5680 [7:35:47<6:03:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5061', 'grad_norm': '0.3342', 'learning_rate': '9.63e-05', 'ppl': '1.659', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23822336, 'tokens/trainable': 23566608, 'epoch': '4.047'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2908/5680 [7:35:47<6:03:53,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2909/5680 [7:35:55<6:03:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.501', 'grad_norm': '0.4447', 'learning_rate': '9.624e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23830528, 'tokens/trainable': 23574784, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2909/5680 [7:35:55<6:03:45,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2910/5680 [7:36:03<6:03:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5204', 'grad_norm': '0.4169', 'learning_rate': '9.618e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23838720, 'tokens/trainable': 23582950, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 2910/5680 [7:36:03<6:03:17,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2911/5680 [7:36:11<6:03:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6523', 'grad_norm': '0.3352', 'learning_rate': '9.613e-05', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 23846912, 'tokens/trainable': 23591038, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2911/5680 [7:36:11<6:03:54,  7.89s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2912/5680 [7:36:19<6:03:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3599', 'grad_norm': '0.3166', 'learning_rate': '9.607e-05', 'ppl': '1.433', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 23855104, 'tokens/trainable': 23599194, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2912/5680 [7:36:19<6:03:27,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2913/5680 [7:36:27<6:03:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5252', 'grad_norm': '0.3699', 'learning_rate': '9.602e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23863296, 'tokens/trainable': 23607370, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 2913/5680 [7:36:27<6:03:12,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2914/5680 [7:36:35<6:02:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3499', 'grad_norm': '0.2893', 'learning_rate': '9.596e-05', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23871488, 'tokens/trainable': 23615520, 'epoch': '4.048'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2914/5680 [7:36:35<6:02:51,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2915/5680 [7:36:43<6:02:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.602', 'grad_norm': '0.3538', 'learning_rate': '9.591e-05', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23879680, 'tokens/trainable': 23623636, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2915/5680 [7:36:43<6:02:16,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2916/5680 [7:36:50<6:01:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5707', 'grad_norm': '0.3519', 'learning_rate': '9.585e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 23887872, 'tokens/trainable': 23631772, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2916/5680 [7:36:50<6:01:57,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2917/5680 [7:36:58<6:02:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.513', 'grad_norm': '0.3794', 'learning_rate': '9.58e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23896064, 'tokens/trainable': 23639952, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                             | 2917/5680 [7:36:58<6:02:08,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2918/5680 [7:37:06<6:02:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5071', 'grad_norm': '0.3354', 'learning_rate': '9.574e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 23904256, 'tokens/trainable': 23648050, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2918/5680 [7:37:06<6:02:02,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2919/5680 [7:37:14<6:02:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4009', 'grad_norm': '0.3429', 'learning_rate': '9.569e-05', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 23912448, 'tokens/trainable': 23656162, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2919/5680 [7:37:14<6:02:30,  7.88s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2920/5680 [7:37:22<6:01:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4524', 'grad_norm': '0.3208', 'learning_rate': '9.563e-05', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 23920640, 'tokens/trainable': 23664286, 'epoch': '4.049'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2920/5680 [7:37:22<6:01:52,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2921/5680 [7:37:30<6:01:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5086', 'grad_norm': '0.3388', 'learning_rate': '9.558e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 23928832, 'tokens/trainable': 23672458, 'epoch': '4.05'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2921/5680 [7:37:30<6:01:43,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2922/5680 [7:37:38<6:01:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3694', 'grad_norm': '0.3484', 'learning_rate': '9.552e-05', 'ppl': '1.447', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 23937024, 'tokens/trainable': 23680556, 'epoch': '4.05'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2922/5680 [7:37:38<6:01:24,  7.86s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2923/5680 [7:37:45<6:01:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5594', 'grad_norm': '0.4074', 'learning_rate': '9.547e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23945216, 'tokens/trainable': 23688712, 'epoch': '4.05'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2923/5680 [7:37:45<6:01:34,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2924/5680 [7:37:53<6:01:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6106', 'grad_norm': '0.2976', 'learning_rate': '9.541e-05', 'ppl': '1.841', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 23953408, 'tokens/trainable': 23696852, 'epoch': '4.05'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2924/5680 [7:37:53<6:01:34,  7.87s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2925/5680 [7:38:01<6:00:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7193', 'grad_norm': '0.3814', 'learning_rate': '9.536e-05', 'ppl': '2.053', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23961600, 'tokens/trainable': 23705002, 'epoch': '4.05'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 2925/5680 [7:38:01<6:00:52,  7.86s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2926/5680 [7:38:09<6:01:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3813', 'grad_norm': '0.3031', 'learning_rate': '9.53e-05', 'ppl': '1.464', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 23969792, 'tokens/trainable': 23713158, 'epoch': '4.051'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2926/5680 [7:38:09<6:01:00,  7.87s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2927/5680 [7:38:17<6:00:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6352', 'grad_norm': '0.3416', 'learning_rate': '9.525e-05', 'ppl': '1.887', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 23977984, 'tokens/trainable': 23721328, 'epoch': '4.051'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2927/5680 [7:38:17<6:00:48,  7.86s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2928/5680 [7:38:25<6:00:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4555', 'grad_norm': '0.3361', 'learning_rate': '9.519e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 23986176, 'tokens/trainable': 23729508, 'epoch': '4.051'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 2928/5680 [7:38:25<6:00:35,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2929/5680 [7:38:33<6:01:09,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2936', 'grad_norm': '0.3444', 'learning_rate': '9.513e-05', 'ppl': '1.341', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 23994368, 'tokens/trainable': 23737624, 'epoch': '4.051'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2929/5680 [7:38:33<6:01:09,  7.88s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2930/5680 [7:38:41<6:00:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5009', 'grad_norm': '0.3811', 'learning_rate': '9.508e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 24002560, 'tokens/trainable': 23745796, 'epoch': '4.051'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2930/5680 [7:38:41<6:00:28,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2931/5680 [7:38:48<6:00:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4674', 'grad_norm': '0.3879', 'learning_rate': '9.502e-05', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24010752, 'tokens/trainable': 23753956, 'epoch': '4.051'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2931/5680 [7:38:48<6:00:14,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2932/5680 [7:38:56<5:59:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4433', 'grad_norm': '0.3264', 'learning_rate': '9.497e-05', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24018944, 'tokens/trainable': 23762092, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 2932/5680 [7:38:56<5:59:47,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2933/5680 [7:39:04<5:59:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5571', 'grad_norm': '0.3598', 'learning_rate': '9.491e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24027136, 'tokens/trainable': 23770220, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2933/5680 [7:39:04<5:59:22,  7.85s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2934/5680 [7:39:12<6:04:07,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7102', 'grad_norm': '0.4292', 'learning_rate': '9.486e-05', 'ppl': '2.034', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.7', 'tokens/total': 24035328, 'tokens/trainable': 23778396, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2934/5680 [7:39:12<6:04:07,  7.96s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2935/5680 [7:39:20<6:03:02,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3226', 'grad_norm': '0.2462', 'learning_rate': '9.48e-05', 'ppl': '1.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24043520, 'tokens/trainable': 23786548, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2935/5680 [7:39:20<6:03:02,  7.94s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2936/5680 [7:39:28<6:02:09,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5681', 'grad_norm': '0.3429', 'learning_rate': '9.475e-05', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24051712, 'tokens/trainable': 23794726, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 2936/5680 [7:39:28<6:02:09,  7.92s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2937/5680 [7:39:36<6:00:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6891', 'grad_norm': '0.3756', 'learning_rate': '9.469e-05', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 24059904, 'tokens/trainable': 23802894, 'epoch': '4.052'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2937/5680 [7:39:36<6:00:44,  7.89s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2938/5680 [7:39:44<6:00:20,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3643', 'grad_norm': '0.3024', 'learning_rate': '9.464e-05', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24068096, 'tokens/trainable': 23811020, 'epoch': '4.053'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2938/5680 [7:39:44<6:00:20,  7.89s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2939/5680 [7:39:52<5:59:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5712', 'grad_norm': '0.3472', 'learning_rate': '9.458e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24076288, 'tokens/trainable': 23819170, 'epoch': '4.053'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 2939/5680 [7:39:52<5:59:42,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2940/5680 [7:39:59<5:59:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3767', 'grad_norm': '0.3163', 'learning_rate': '9.453e-05', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 24084480, 'tokens/trainable': 23827250, 'epoch': '4.053'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2940/5680 [7:39:59<5:59:39,  7.88s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2941/5680 [7:40:07<5:59:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6468', 'grad_norm': '0.3364', 'learning_rate': '9.447e-05', 'ppl': '1.909', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24092672, 'tokens/trainable': 23835420, 'epoch': '4.053'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2941/5680 [7:40:07<5:59:09,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2942/5680 [7:40:15<5:59:53,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5001', 'grad_norm': '0.4349', 'learning_rate': '9.442e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24100864, 'tokens/trainable': 23843580, 'epoch': '4.053'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2942/5680 [7:40:15<5:59:53,  7.89s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2943/5680 [7:40:23<5:59:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6084', 'grad_norm': '0.3285', 'learning_rate': '9.436e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24109056, 'tokens/trainable': 23851688, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 2943/5680 [7:40:23<5:59:04,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2944/5680 [7:40:31<5:58:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4272', 'grad_norm': '0.3231', 'learning_rate': '9.431e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 24117248, 'tokens/trainable': 23859760, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2944/5680 [7:40:31<5:58:53,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2945/5680 [7:40:39<5:58:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6264', 'grad_norm': '0.3866', 'learning_rate': '9.425e-05', 'ppl': '1.871', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24125440, 'tokens/trainable': 23867876, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2945/5680 [7:40:39<5:58:16,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2946/5680 [7:40:47<5:58:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7901', 'grad_norm': '0.411', 'learning_rate': '9.42e-05', 'ppl': '2.204', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24133632, 'tokens/trainable': 23876052, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2946/5680 [7:40:47<5:58:17,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2947/5680 [7:40:54<5:57:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5242', 'grad_norm': '0.3111', 'learning_rate': '9.414e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24141824, 'tokens/trainable': 23884184, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 2947/5680 [7:40:54<5:57:58,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2948/5680 [7:41:02<5:58:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4819', 'grad_norm': '0.3267', 'learning_rate': '9.409e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24150016, 'tokens/trainable': 23892318, 'epoch': '4.054'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2948/5680 [7:41:02<5:58:32,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2949/5680 [7:41:10<5:57:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.3203', 'learning_rate': '9.403e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24158208, 'tokens/trainable': 23900428, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2949/5680 [7:41:10<5:57:54,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2950/5680 [7:41:18<5:58:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8014', 'grad_norm': '0.3615', 'learning_rate': '9.397e-05', 'ppl': '2.229', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 24166400, 'tokens/trainable': 23908524, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 2950/5680 [7:41:18<5:58:05,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2951/5680 [7:41:26<5:57:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.667', 'grad_norm': '0.3625', 'learning_rate': '9.392e-05', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 24174592, 'tokens/trainable': 23916692, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2951/5680 [7:41:26<5:57:27,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2952/5680 [7:41:34<5:57:08,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6399', 'grad_norm': '0.3294', 'learning_rate': '9.386e-05', 'ppl': '1.896', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24182784, 'tokens/trainable': 23924804, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2952/5680 [7:41:34<5:57:08,  7.85s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2953/5680 [7:41:42<5:57:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3502', 'grad_norm': '0.3144', 'learning_rate': '9.381e-05', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24190976, 'tokens/trainable': 23932984, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2953/5680 [7:41:42<5:57:36,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2954/5680 [7:41:50<5:57:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8286', 'grad_norm': '0.3536', 'learning_rate': '9.375e-05', 'ppl': '2.29', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24199168, 'tokens/trainable': 23941140, 'epoch': '4.055'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 2954/5680 [7:41:50<5:57:24,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2955/5680 [7:41:57<5:57:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7161', 'grad_norm': '0.3774', 'learning_rate': '9.37e-05', 'ppl': '2.046', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24207360, 'tokens/trainable': 23949294, 'epoch': '4.056'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2955/5680 [7:41:57<5:57:12,  7.87s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2956/5680 [7:42:05<5:56:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6079', 'grad_norm': '0.3808', 'learning_rate': '9.364e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 24215552, 'tokens/trainable': 23957388, 'epoch': '4.056'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2956/5680 [7:42:05<5:56:51,  7.86s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2957/5680 [7:42:13<5:56:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5332', 'grad_norm': '0.397', 'learning_rate': '9.359e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24223744, 'tokens/trainable': 23965548, 'epoch': '4.056'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2957/5680 [7:42:13<5:56:26,  7.85s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2958/5680 [7:42:21<5:56:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6378', 'grad_norm': '0.3545', 'learning_rate': '9.353e-05', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24231936, 'tokens/trainable': 23973712, 'epoch': '4.056'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 2958/5680 [7:42:21<5:56:33,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2959/5680 [7:42:29<5:56:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4351', 'grad_norm': '0.3713', 'learning_rate': '9.348e-05', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 24240128, 'tokens/trainable': 23981888, 'epoch': '4.056'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2959/5680 [7:42:29<5:56:17,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2960/5680 [7:42:37<5:56:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5384', 'grad_norm': '0.3861', 'learning_rate': '9.342e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24248320, 'tokens/trainable': 23990008, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2960/5680 [7:42:37<5:56:01,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2961/5680 [7:42:45<5:55:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5503', 'grad_norm': '0.3462', 'learning_rate': '9.337e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24256512, 'tokens/trainable': 23998130, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2961/5680 [7:42:45<5:55:32,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2962/5680 [7:42:52<5:55:47,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4755', 'grad_norm': '0.3644', 'learning_rate': '9.331e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24264704, 'tokens/trainable': 24006304, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2962/5680 [7:42:52<5:55:47,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2963/5680 [7:43:00<5:55:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.711', 'grad_norm': '0.4213', 'learning_rate': '9.326e-05', 'ppl': '2.036', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24272896, 'tokens/trainable': 24014398, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2963/5680 [7:43:00<5:55:23,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2964/5680 [7:43:08<5:54:58,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6345', 'grad_norm': '0.3925', 'learning_rate': '9.32e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24281088, 'tokens/trainable': 24022484, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2964/5680 [7:43:08<5:54:58,  7.84s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2965/5680 [7:43:16<5:55:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4791', 'grad_norm': '0.3258', 'learning_rate': '9.315e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24289280, 'tokens/trainable': 24030648, 'epoch': '4.057'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                           | 2965/5680 [7:43:16<5:55:16,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2966/5680 [7:43:24<5:55:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4053', 'grad_norm': '0.3112', 'learning_rate': '9.309e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24297472, 'tokens/trainable': 24038778, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2966/5680 [7:43:24<5:55:26,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2967/5680 [7:43:32<5:55:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4413', 'grad_norm': '0.3604', 'learning_rate': '9.304e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24305664, 'tokens/trainable': 24046920, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2967/5680 [7:43:32<5:55:09,  7.85s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2968/5680 [7:43:40<5:58:56,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5121', 'grad_norm': '0.3626', 'learning_rate': '9.298e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 24313856, 'tokens/trainable': 24055070, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2968/5680 [7:43:40<5:58:56,  7.94s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2969/5680 [7:43:48<5:58:02,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4355', 'grad_norm': '0.3718', 'learning_rate': '9.293e-05', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24322048, 'tokens/trainable': 24063232, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 2969/5680 [7:43:48<5:58:02,  7.92s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2970/5680 [7:43:56<5:56:57,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6496', 'grad_norm': '0.3849', 'learning_rate': '9.287e-05', 'ppl': '1.915', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 24330240, 'tokens/trainable': 24071336, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2970/5680 [7:43:56<5:56:57,  7.90s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2971/5680 [7:44:03<5:56:22,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5396', 'grad_norm': '0.3548', 'learning_rate': '9.282e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 24338432, 'tokens/trainable': 24079454, 'epoch': '4.058'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2971/5680 [7:44:03<5:56:22,  7.89s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2972/5680 [7:44:11<5:55:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.502', 'grad_norm': '0.3436', 'learning_rate': '9.276e-05', 'ppl': '1.652', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24346624, 'tokens/trainable': 24087600, 'epoch': '4.059'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2972/5680 [7:44:11<5:55:47,  7.88s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2973/5680 [7:44:19<5:55:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5788', 'grad_norm': '0.4248', 'learning_rate': '9.271e-05', 'ppl': '1.784', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24354816, 'tokens/trainable': 24095754, 'epoch': '4.059'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 2973/5680 [7:44:19<5:55:23,  7.88s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2974/5680 [7:44:27<5:54:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5147', 'grad_norm': '0.3838', 'learning_rate': '9.265e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24363008, 'tokens/trainable': 24103916, 'epoch': '4.059'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2974/5680 [7:44:27<5:54:51,  7.87s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2975/5680 [7:44:35<5:54:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3658', 'grad_norm': '0.2903', 'learning_rate': '9.26e-05', 'ppl': '1.442', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24371200, 'tokens/trainable': 24112006, 'epoch': '4.059'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2975/5680 [7:44:35<5:54:40,  7.87s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2976/5680 [7:44:43<5:54:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4208', 'grad_norm': '0.3355', 'learning_rate': '9.254e-05', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24379392, 'tokens/trainable': 24120116, 'epoch': '4.059'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2976/5680 [7:44:43<5:54:12,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2977/5680 [7:44:51<5:54:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4139', 'grad_norm': '0.4163', 'learning_rate': '9.248e-05', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24387584, 'tokens/trainable': 24128296, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2977/5680 [7:44:51<5:54:05,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2978/5680 [7:44:58<5:53:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4757', 'grad_norm': '0.3633', 'learning_rate': '9.243e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 24395776, 'tokens/trainable': 24136402, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2978/5680 [7:44:58<5:53:52,  7.86s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2979/5680 [7:45:06<5:54:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3939', 'grad_norm': '0.3822', 'learning_rate': '9.237e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24403968, 'tokens/trainable': 24144562, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2979/5680 [7:45:06<5:54:22,  7.87s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2980/5680 [7:45:14<5:54:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5534', 'grad_norm': '0.4499', 'learning_rate': '9.232e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24412160, 'tokens/trainable': 24152724, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                           | 2980/5680 [7:45:14<5:54:13,  7.87s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2981/5680 [7:45:22<5:54:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.396', 'grad_norm': '0.2917', 'learning_rate': '9.226e-05', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24420352, 'tokens/trainable': 24160896, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2981/5680 [7:45:22<5:54:16,  7.88s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2982/5680 [7:45:30<5:54:22,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4528', 'grad_norm': '0.3587', 'learning_rate': '9.221e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 24428544, 'tokens/trainable': 24169024, 'epoch': '4.06'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2982/5680 [7:45:30<5:54:22,  7.88s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2983/5680 [7:45:38<5:54:02,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6104', 'grad_norm': '0.4337', 'learning_rate': '9.215e-05', 'ppl': '1.841', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24436736, 'tokens/trainable': 24177192, 'epoch': '4.061'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2983/5680 [7:45:38<5:54:02,  7.88s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2984/5680 [7:45:46<5:54:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3655', 'grad_norm': '0.3676', 'learning_rate': '9.21e-05', 'ppl': '1.441', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24444928, 'tokens/trainable': 24185380, 'epoch': '4.061'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 2984/5680 [7:45:46<5:54:11,  7.88s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2985/5680 [7:45:54<5:53:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8146', 'grad_norm': '0.5016', 'learning_rate': '9.204e-05', 'ppl': '2.258', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24453120, 'tokens/trainable': 24193504, 'epoch': '4.061'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2985/5680 [7:45:54<5:53:39,  7.87s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2986/5680 [7:46:01<5:53:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5702', 'grad_norm': '0.3671', 'learning_rate': '9.199e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 24461312, 'tokens/trainable': 24201694, 'epoch': '4.061'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2986/5680 [7:46:01<5:53:10,  7.87s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2987/5680 [7:46:09<5:53:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3559', 'grad_norm': '0.3213', 'learning_rate': '9.193e-05', 'ppl': '1.427', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24469504, 'tokens/trainable': 24209862, 'epoch': '4.061'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 2987/5680 [7:46:09<5:53:16,  7.87s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2988/5680 [7:46:17<5:52:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4182', 'grad_norm': '0.3419', 'learning_rate': '9.188e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24477696, 'tokens/trainable': 24218030, 'epoch': '4.061'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2988/5680 [7:46:17<5:52:53,  7.87s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2989/5680 [7:46:25<5:53:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5322', 'grad_norm': '0.3523', 'learning_rate': '9.182e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24485888, 'tokens/trainable': 24226196, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2989/5680 [7:46:25<5:53:03,  7.87s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2990/5680 [7:46:33<5:53:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4334', 'grad_norm': '0.4148', 'learning_rate': '9.177e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24494080, 'tokens/trainable': 24234336, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2990/5680 [7:46:33<5:53:27,  7.88s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2991/5680 [7:46:41<5:52:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4014', 'grad_norm': '0.4208', 'learning_rate': '9.171e-05', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 24502272, 'tokens/trainable': 24242426, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 2991/5680 [7:46:41<5:52:45,  7.87s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2992/5680 [7:46:49<5:52:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4443', 'grad_norm': '0.386', 'learning_rate': '9.166e-05', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24510464, 'tokens/trainable': 24250584, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2992/5680 [7:46:49<5:52:17,  7.86s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2993/5680 [7:46:56<5:51:41,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4802', 'grad_norm': '0.3479', 'learning_rate': '9.16e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24518656, 'tokens/trainable': 24258708, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2993/5680 [7:46:56<5:51:41,  7.85s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2994/5680 [7:47:04<5:51:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6573', 'grad_norm': '0.3751', 'learning_rate': '9.155e-05', 'ppl': '1.93', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 24526848, 'tokens/trainable': 24266886, 'epoch': '4.062'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2994/5680 [7:47:04<5:51:37,  7.85s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2995/5680 [7:47:12<5:51:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5475', 'grad_norm': '0.3435', 'learning_rate': '9.149e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24535040, 'tokens/trainable': 24275024, 'epoch': '4.063'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2995/5680 [7:47:12<5:51:17,  7.85s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2996/5680 [7:47:20<5:51:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5755', 'grad_norm': '0.3619', 'learning_rate': '9.144e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 24543232, 'tokens/trainable': 24283118, 'epoch': '4.063'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2996/5680 [7:47:20<5:51:48,  7.86s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2997/5680 [7:47:28<5:51:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4687', 'grad_norm': '0.3163', 'learning_rate': '9.138e-05', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 24551424, 'tokens/trainable': 24291296, 'epoch': '4.063'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2997/5680 [7:47:28<5:51:29,  7.86s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2998/5680 [7:47:36<5:51:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5926', 'grad_norm': '0.3511', 'learning_rate': '9.133e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 24559616, 'tokens/trainable': 24299402, 'epoch': '4.063'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2998/5680 [7:47:36<5:51:45,  7.87s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2999/5680 [7:47:44<5:51:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5049', 'grad_norm': '0.4548', 'learning_rate': '9.127e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24567808, 'tokens/trainable': 24307544, 'epoch': '4.063'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 2999/5680 [7:47:44<5:51:17,  7.86s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                          | 3000/5680 [7:47:51<5:51:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5479', 'grad_norm': '0.3388', 'learning_rate': '9.122e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24576000, 'tokens/trainable': 24315678, 'epoch': '4.064'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                          | 3000/5680 [7:47:51<5:51:01,  7.86s/it][2026-01-27 05:37:05,539] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 05:37:50,726] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-3000
[2026-01-27 05:38:44,605] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 05:38:44,605] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3001/5680 [7:49:39<28:06:56, 37.78s/it]                                                                                                                                                                                                                                             {'loss': '0.377', 'grad_norm': '0.3611', 'learning_rate': '9.116e-05', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989', 'tokens/total': 24584192, 'tokens/trainable': 24323852, 'epoch': '4.064'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3001/5680 [7:49:39<28:06:56, 37.78s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3002/5680 [7:49:47<21:25:40, 28.81s/it]                                                                                                                                                                                                                                             {'loss': '0.745', 'grad_norm': '0.4527', 'learning_rate': '9.111e-05', 'ppl': '2.107', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24592384, 'tokens/trainable': 24332018, 'epoch': '4.064'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3002/5680 [7:49:47<21:25:40, 28.81s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3003/5680 [7:49:55<16:48:42, 22.61s/it]                                                                                                                                                                                                                                             {'loss': '0.7906', 'grad_norm': '0.4352', 'learning_rate': '9.105e-05', 'ppl': '2.205', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 24600576, 'tokens/trainable': 24340204, 'epoch': '4.064'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3003/5680 [7:49:55<16:48:42, 22.61s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3004/5680 [7:50:03<13:30:57, 18.18s/it]                                                                                                                                                                                                                                             {'loss': '0.5357', 'grad_norm': '0.3178', 'learning_rate': '9.1e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24608768, 'tokens/trainable': 24348324, 'epoch': '4.064'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3004/5680 [7:50:03<13:30:57, 18.18s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3005/5680 [7:50:11<11:12:46, 15.09s/it]                                                                                                                                                                                                                                             {'loss': '0.6279', 'grad_norm': '0.3741', 'learning_rate': '9.094e-05', 'ppl': '1.874', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24616960, 'tokens/trainable': 24356502, 'epoch': '4.064'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3005/5680 [7:50:11<11:12:46, 15.09s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 3006/5680 [7:50:19<9:36:18, 12.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6117', 'grad_norm': '0.3483', 'learning_rate': '9.089e-05', 'ppl': '1.843', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24625152, 'tokens/trainable': 24364620, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 3006/5680 [7:50:19<9:36:18, 12.93s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3007/5680 [7:50:27<8:28:21, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.5617', 'grad_norm': '0.3171', 'learning_rate': '9.083e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24633344, 'tokens/trainable': 24372760, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3007/5680 [7:50:27<8:28:21, 11.41s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3008/5680 [7:50:34<7:41:04, 10.35s/it]                                                                                                                                                                                                                                             {'loss': '0.6165', 'grad_norm': '0.3671', 'learning_rate': '9.078e-05', 'ppl': '1.852', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24641536, 'tokens/trainable': 24380948, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3008/5680 [7:50:34<7:41:04, 10.35s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3009/5680 [7:50:42<7:07:38,  9.61s/it]                                                                                                                                                                                                                                             {'loss': '0.6445', 'grad_norm': '0.3704', 'learning_rate': '9.072e-05', 'ppl': '1.905', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24649728, 'tokens/trainable': 24389070, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3009/5680 [7:50:42<7:07:38,  9.61s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3010/5680 [7:50:50<6:44:26,  9.09s/it]                                                                                                                                                                                                                                             {'loss': '0.6209', 'grad_norm': '0.4193', 'learning_rate': '9.067e-05', 'ppl': '1.861', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24657920, 'tokens/trainable': 24397230, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 3010/5680 [7:50:50<6:44:26,  9.09s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3011/5680 [7:50:58<6:27:38,  8.71s/it]                                                                                                                                                                                                                                             {'loss': '0.4214', 'grad_norm': '0.3733', 'learning_rate': '9.061e-05', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24666112, 'tokens/trainable': 24405364, 'epoch': '4.065'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3011/5680 [7:50:58<6:27:38,  8.71s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3012/5680 [7:51:06<6:16:04,  8.46s/it]                                                                                                                                                                                                                                             {'loss': '0.3325', 'grad_norm': '0.3357', 'learning_rate': '9.056e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 24674304, 'tokens/trainable': 24413460, 'epoch': '4.066'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3012/5680 [7:51:06<6:16:04,  8.46s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3013/5680 [7:51:14<6:08:15,  8.28s/it]                                                                                                                                                                                                                                             {'loss': '0.5054', 'grad_norm': '0.3568', 'learning_rate': '9.05e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24682496, 'tokens/trainable': 24421632, 'epoch': '4.066'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 3013/5680 [7:51:14<6:08:15,  8.28s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3014/5680 [7:51:22<6:02:29,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.6114', 'grad_norm': '0.4203', 'learning_rate': '9.045e-05', 'ppl': '1.843', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 24690688, 'tokens/trainable': 24429730, 'epoch': '4.066'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3014/5680 [7:51:22<6:02:29,  8.16s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3015/5680 [7:51:29<5:57:58,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.545', 'grad_norm': '0.3101', 'learning_rate': '9.039e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 24698880, 'tokens/trainable': 24437920, 'epoch': '4.066'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3015/5680 [7:51:29<5:57:58,  8.06s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3016/5680 [7:51:37<5:55:15,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3863', 'grad_norm': '0.3578', 'learning_rate': '9.034e-05', 'ppl': '1.472', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24707072, 'tokens/trainable': 24446088, 'epoch': '4.066'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3016/5680 [7:51:37<5:55:15,  8.00s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3017/5680 [7:51:45<5:52:26,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6974', 'grad_norm': '0.3669', 'learning_rate': '9.028e-05', 'ppl': '2.008', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24715264, 'tokens/trainable': 24454164, 'epoch': '4.067'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 3017/5680 [7:51:45<5:52:26,  7.94s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3018/5680 [7:51:53<5:50:56,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.707', 'grad_norm': '0.3952', 'learning_rate': '9.023e-05', 'ppl': '2.028', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 24723456, 'tokens/trainable': 24462256, 'epoch': '4.067'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3018/5680 [7:51:53<5:50:56,  7.91s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3019/5680 [7:52:01<5:49:55,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.677', 'grad_norm': '0.3194', 'learning_rate': '9.017e-05', 'ppl': '1.968', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 24731648, 'tokens/trainable': 24470432, 'epoch': '4.067'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3019/5680 [7:52:01<5:49:55,  7.89s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3020/5680 [7:52:09<5:48:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7436', 'grad_norm': '0.35', 'learning_rate': '9.012e-05', 'ppl': '2.104', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 24739840, 'tokens/trainable': 24478600, 'epoch': '4.067'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3020/5680 [7:52:09<5:48:50,  7.87s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3021/5680 [7:52:17<5:48:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4643', 'grad_norm': '0.332', 'learning_rate': '9.006e-05', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24748032, 'tokens/trainable': 24486768, 'epoch': '4.067'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 3021/5680 [7:52:17<5:48:28,  7.86s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3022/5680 [7:52:24<5:48:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.469', 'grad_norm': '0.3057', 'learning_rate': '9.001e-05', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 24756224, 'tokens/trainable': 24494876, 'epoch': '4.067'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3022/5680 [7:52:24<5:48:53,  7.88s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3023/5680 [7:52:32<5:47:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6617', 'grad_norm': '0.354', 'learning_rate': '8.995e-05', 'ppl': '1.938', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 24764416, 'tokens/trainable': 24502960, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3023/5680 [7:52:32<5:47:55,  7.86s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3024/5680 [7:52:40<5:47:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7016', 'grad_norm': '0.3431', 'learning_rate': '8.99e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24772608, 'tokens/trainable': 24511088, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 3024/5680 [7:52:40<5:47:38,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3025/5680 [7:52:48<5:47:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6436', 'grad_norm': '0.3479', 'learning_rate': '8.984e-05', 'ppl': '1.903', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 24780800, 'tokens/trainable': 24519188, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3025/5680 [7:52:48<5:47:31,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3026/5680 [7:52:56<5:47:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4798', 'grad_norm': '0.3447', 'learning_rate': '8.979e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 24788992, 'tokens/trainable': 24527316, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3026/5680 [7:52:56<5:47:04,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3027/5680 [7:53:04<5:47:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4859', 'grad_norm': '0.3038', 'learning_rate': '8.973e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 24797184, 'tokens/trainable': 24535414, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3027/5680 [7:53:04<5:47:12,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3028/5680 [7:53:11<5:46:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6919', 'grad_norm': '0.4795', 'learning_rate': '8.968e-05', 'ppl': '1.997', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 24805376, 'tokens/trainable': 24543582, 'epoch': '4.068'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 3028/5680 [7:53:11<5:46:45,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3029/5680 [7:53:19<5:46:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6217', 'grad_norm': '0.4076', 'learning_rate': '8.962e-05', 'ppl': '1.862', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24813568, 'tokens/trainable': 24551710, 'epoch': '4.069'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3029/5680 [7:53:19<5:46:37,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3030/5680 [7:53:27<5:46:23,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.515', 'grad_norm': '0.3129', 'learning_rate': '8.957e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24821760, 'tokens/trainable': 24559816, 'epoch': '4.069'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3030/5680 [7:53:27<5:46:23,  7.84s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3031/5680 [7:53:35<5:46:10,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5047', 'grad_norm': '0.3071', 'learning_rate': '8.951e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24829952, 'tokens/trainable': 24567964, 'epoch': '4.069'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3031/5680 [7:53:35<5:46:10,  7.84s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3032/5680 [7:53:43<5:46:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6431', 'grad_norm': '0.4565', 'learning_rate': '8.946e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24838144, 'tokens/trainable': 24576148, 'epoch': '4.069'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 3032/5680 [7:53:43<5:46:31,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3033/5680 [7:53:51<5:46:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.8775', 'grad_norm': '0.416', 'learning_rate': '8.94e-05', 'ppl': '2.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24846336, 'tokens/trainable': 24584310, 'epoch': '4.069'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3033/5680 [7:53:51<5:46:29,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3034/5680 [7:53:59<5:46:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5961', 'grad_norm': '0.3257', 'learning_rate': '8.935e-05', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 24854528, 'tokens/trainable': 24592452, 'epoch': '4.07'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3034/5680 [7:53:59<5:46:11,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3035/5680 [7:54:06<5:45:45,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7589', 'grad_norm': '0.3979', 'learning_rate': '8.929e-05', 'ppl': '2.136', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24862720, 'tokens/trainable': 24600596, 'epoch': '4.07'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 3035/5680 [7:54:06<5:45:45,  7.84s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3036/5680 [7:54:14<5:45:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3184', 'grad_norm': '0.2685', 'learning_rate': '8.924e-05', 'ppl': '1.375', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24870912, 'tokens/trainable': 24608718, 'epoch': '4.07'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3036/5680 [7:54:14<5:45:50,  7.85s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3037/5680 [7:54:22<5:46:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4177', 'grad_norm': '0.3884', 'learning_rate': '8.918e-05', 'ppl': '1.518', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 24879104, 'tokens/trainable': 24616808, 'epoch': '4.07'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3037/5680 [7:54:22<5:46:20,  7.86s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3038/5680 [7:54:30<5:46:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5862', 'grad_norm': '0.3965', 'learning_rate': '8.913e-05', 'ppl': '1.797', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24887296, 'tokens/trainable': 24624964, 'epoch': '4.07'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3038/5680 [7:54:30<5:46:17,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3039/5680 [7:54:38<5:46:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4082', 'grad_norm': '0.2887', 'learning_rate': '8.907e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24895488, 'tokens/trainable': 24633120, 'epoch': '4.07'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 3039/5680 [7:54:38<5:46:15,  7.87s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3040/5680 [7:54:46<5:46:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5149', 'grad_norm': '0.3391', 'learning_rate': '8.902e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 24903680, 'tokens/trainable': 24641244, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3040/5680 [7:54:46<5:46:18,  7.87s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3041/5680 [7:54:54<5:45:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7165', 'grad_norm': '0.3825', 'learning_rate': '8.896e-05', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24911872, 'tokens/trainable': 24649368, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3041/5680 [7:54:54<5:45:46,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3042/5680 [7:55:01<5:45:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6479', 'grad_norm': '0.4159', 'learning_rate': '8.891e-05', 'ppl': '1.912', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 24920064, 'tokens/trainable': 24657492, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3042/5680 [7:55:01<5:45:36,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3043/5680 [7:55:09<5:45:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4682', 'grad_norm': '0.3828', 'learning_rate': '8.885e-05', 'ppl': '1.597', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24928256, 'tokens/trainable': 24665680, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                         | 3043/5680 [7:55:09<5:45:33,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3044/5680 [7:55:17<5:45:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.454', 'grad_norm': '0.3165', 'learning_rate': '8.88e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 24936448, 'tokens/trainable': 24673834, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3044/5680 [7:55:17<5:45:29,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3045/5680 [7:55:25<5:45:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6139', 'grad_norm': '0.3557', 'learning_rate': '8.874e-05', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 24944640, 'tokens/trainable': 24682016, 'epoch': '4.071'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3045/5680 [7:55:25<5:45:23,  7.86s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3046/5680 [7:55:33<5:45:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.771', 'grad_norm': '0.5306', 'learning_rate': '8.869e-05', 'ppl': '2.162', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 24952832, 'tokens/trainable': 24690140, 'epoch': '4.072'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3046/5680 [7:55:33<5:45:36,  7.87s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3047/5680 [7:55:41<5:45:42,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3714', 'grad_norm': '0.4141', 'learning_rate': '8.863e-05', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 24961024, 'tokens/trainable': 24698252, 'epoch': '4.072'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 3047/5680 [7:55:41<5:45:42,  7.88s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3048/5680 [7:55:49<5:45:51,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3843', 'grad_norm': '0.3379', 'learning_rate': '8.858e-05', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 24969216, 'tokens/trainable': 24706352, 'epoch': '4.072'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3048/5680 [7:55:49<5:45:51,  7.88s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3049/5680 [7:55:57<5:45:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4858', 'grad_norm': '0.3564', 'learning_rate': '8.852e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 24977408, 'tokens/trainable': 24714520, 'epoch': '4.072'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3049/5680 [7:55:57<5:45:17,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3050/5680 [7:56:04<5:45:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3777', 'grad_norm': '0.3084', 'learning_rate': '8.847e-05', 'ppl': '1.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 24985600, 'tokens/trainable': 24722628, 'epoch': '4.072'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 3050/5680 [7:56:04<5:45:06,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3051/5680 [7:56:12<5:44:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4057', 'grad_norm': '0.3506', 'learning_rate': '8.841e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 24993792, 'tokens/trainable': 24730746, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3051/5680 [7:56:12<5:44:32,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3052/5680 [7:56:20<5:44:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4728', 'grad_norm': '0.3429', 'learning_rate': '8.836e-05', 'ppl': '1.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25001984, 'tokens/trainable': 24738932, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3052/5680 [7:56:20<5:44:46,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3053/5680 [7:56:28<5:44:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5728', 'grad_norm': '0.338', 'learning_rate': '8.83e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 25010176, 'tokens/trainable': 24747092, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3053/5680 [7:56:28<5:44:06,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3054/5680 [7:56:36<5:43:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4664', 'grad_norm': '0.3196', 'learning_rate': '8.825e-05', 'ppl': '1.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 25018368, 'tokens/trainable': 24755252, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 3054/5680 [7:56:36<5:43:44,  7.85s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3055/5680 [7:56:44<5:43:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6128', 'grad_norm': '0.3214', 'learning_rate': '8.819e-05', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 25026560, 'tokens/trainable': 24763300, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3055/5680 [7:56:44<5:43:43,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3056/5680 [7:56:52<5:44:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4836', 'grad_norm': '0.513', 'learning_rate': '8.814e-05', 'ppl': '1.622', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 25034752, 'tokens/trainable': 24771384, 'epoch': '4.073'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3056/5680 [7:56:52<5:44:04,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3057/5680 [7:56:59<5:43:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6467', 'grad_norm': '0.3596', 'learning_rate': '8.808e-05', 'ppl': '1.909', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 25042944, 'tokens/trainable': 24779568, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3057/5680 [7:56:59<5:43:32,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3058/5680 [7:57:07<5:43:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6458', 'grad_norm': '0.4043', 'learning_rate': '8.803e-05', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 25051136, 'tokens/trainable': 24787738, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 3058/5680 [7:57:07<5:43:23,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3059/5680 [7:57:15<5:43:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3962', 'grad_norm': '0.3552', 'learning_rate': '8.797e-05', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25059328, 'tokens/trainable': 24795866, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3059/5680 [7:57:15<5:43:12,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3060/5680 [7:57:23<5:43:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8464', 'grad_norm': '0.4391', 'learning_rate': '8.792e-05', 'ppl': '2.331', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 25067520, 'tokens/trainable': 24804054, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3060/5680 [7:57:23<5:43:11,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3061/5680 [7:57:31<5:43:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6929', 'grad_norm': '0.3607', 'learning_rate': '8.786e-05', 'ppl': '1.999', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 25075712, 'tokens/trainable': 24812160, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 3061/5680 [7:57:31<5:43:16,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3062/5680 [7:57:39<5:42:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4328', 'grad_norm': '0.3416', 'learning_rate': '8.781e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25083904, 'tokens/trainable': 24820252, 'epoch': '4.074'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3062/5680 [7:57:39<5:42:28,  7.85s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3063/5680 [7:57:47<5:42:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3155', 'grad_norm': '0.2682', 'learning_rate': '8.775e-05', 'ppl': '1.371', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25092096, 'tokens/trainable': 24828404, 'epoch': '4.075'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3063/5680 [7:57:47<5:42:50,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3064/5680 [7:57:54<5:42:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5079', 'grad_norm': '0.3681', 'learning_rate': '8.77e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 25100288, 'tokens/trainable': 24836572, 'epoch': '4.075'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3064/5680 [7:57:54<5:42:29,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3065/5680 [7:58:02<5:42:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5262', 'grad_norm': '0.3289', 'learning_rate': '8.764e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25108480, 'tokens/trainable': 24844728, 'epoch': '4.075'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 3065/5680 [7:58:02<5:42:40,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3066/5680 [7:58:10<5:42:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8843', 'grad_norm': '0.5401', 'learning_rate': '8.759e-05', 'ppl': '2.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25116672, 'tokens/trainable': 24852880, 'epoch': '4.075'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3066/5680 [7:58:10<5:42:40,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3067/5680 [7:58:18<5:42:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3491', 'grad_norm': '0.3378', 'learning_rate': '8.753e-05', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25124864, 'tokens/trainable': 24861040, 'epoch': '4.075'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3067/5680 [7:58:18<5:42:40,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3068/5680 [7:58:26<5:42:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5543', 'grad_norm': '0.4826', 'learning_rate': '8.748e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25133056, 'tokens/trainable': 24869192, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3068/5680 [7:58:26<5:42:30,  7.87s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3069/5680 [7:58:34<5:41:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4034', 'grad_norm': '0.3106', 'learning_rate': '8.742e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 25141248, 'tokens/trainable': 24877376, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 3069/5680 [7:58:34<5:41:54,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3070/5680 [7:58:42<5:41:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4026', 'grad_norm': '0.3398', 'learning_rate': '8.737e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 25149440, 'tokens/trainable': 24885544, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3070/5680 [7:58:42<5:41:26,  7.85s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3071/5680 [7:58:49<5:41:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5102', 'grad_norm': '0.4007', 'learning_rate': '8.731e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25157632, 'tokens/trainable': 24893720, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3071/5680 [7:58:49<5:41:48,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3072/5680 [7:58:57<5:41:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7341', 'grad_norm': '0.4172', 'learning_rate': '8.726e-05', 'ppl': '2.084', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25165824, 'tokens/trainable': 24901840, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 3072/5680 [7:58:57<5:41:29,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3073/5680 [7:59:05<5:41:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6518', 'grad_norm': '0.3173', 'learning_rate': '8.72e-05', 'ppl': '1.919', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 25174016, 'tokens/trainable': 24909980, 'epoch': '4.076'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3073/5680 [7:59:05<5:41:01,  7.85s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3074/5680 [7:59:13<5:40:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5174', 'grad_norm': '0.3221', 'learning_rate': '8.715e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 25182208, 'tokens/trainable': 24918020, 'epoch': '4.077'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3074/5680 [7:59:13<5:40:57,  7.85s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3075/5680 [7:59:21<5:41:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6726', 'grad_norm': '0.3858', 'learning_rate': '8.709e-05', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25190400, 'tokens/trainable': 24926176, 'epoch': '4.077'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3075/5680 [7:59:21<5:41:19,  7.86s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3076/5680 [7:59:29<5:41:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3902', 'grad_norm': '0.3939', 'learning_rate': '8.704e-05', 'ppl': '1.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25198592, 'tokens/trainable': 24934344, 'epoch': '4.077'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 3076/5680 [7:59:29<5:41:42,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3077/5680 [7:59:37<5:41:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.69', 'grad_norm': '0.3731', 'learning_rate': '8.698e-05', 'ppl': '1.994', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25206784, 'tokens/trainable': 24942492, 'epoch': '4.077'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3077/5680 [7:59:37<5:41:24,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3078/5680 [7:59:45<5:40:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4421', 'grad_norm': '0.3699', 'learning_rate': '8.693e-05', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 25214976, 'tokens/trainable': 24950680, 'epoch': '4.077'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3078/5680 [7:59:45<5:40:56,  7.86s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3079/5680 [7:59:52<5:41:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3345', 'grad_norm': '0.4089', 'learning_rate': '8.687e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25223168, 'tokens/trainable': 24958808, 'epoch': '4.077'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3079/5680 [7:59:52<5:41:26,  7.88s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3080/5680 [8:00:00<5:42:06,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6789', 'grad_norm': '0.4056', 'learning_rate': '8.682e-05', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25231360, 'tokens/trainable': 24966964, 'epoch': '4.078'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                        | 3080/5680 [8:00:00<5:42:06,  7.89s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3081/5680 [8:00:08<5:41:33,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5498', 'grad_norm': '0.3819', 'learning_rate': '8.676e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25239552, 'tokens/trainable': 24975116, 'epoch': '4.078'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3081/5680 [8:00:08<5:41:33,  7.89s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3082/5680 [8:00:16<5:40:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5261', 'grad_norm': '0.3346', 'learning_rate': '8.671e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25247744, 'tokens/trainable': 24983200, 'epoch': '4.078'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3082/5680 [8:00:16<5:40:53,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3083/5680 [8:00:24<5:40:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6747', 'grad_norm': '0.3872', 'learning_rate': '8.665e-05', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25255936, 'tokens/trainable': 24991312, 'epoch': '4.078'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3083/5680 [8:00:24<5:40:10,  7.86s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3084/5680 [8:00:32<5:40:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5913', 'grad_norm': '0.4088', 'learning_rate': '8.66e-05', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25264128, 'tokens/trainable': 24999482, 'epoch': '4.078'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 3084/5680 [8:00:32<5:40:23,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3085/5680 [8:00:40<5:44:15,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5466', 'grad_norm': '0.3955', 'learning_rate': '8.655e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.5', 'tokens/total': 25272320, 'tokens/trainable': 25007626, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3085/5680 [8:00:40<5:44:15,  7.96s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3086/5680 [8:00:48<5:42:24,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.9165', 'grad_norm': '0.414', 'learning_rate': '8.649e-05', 'ppl': '2.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25280512, 'tokens/trainable': 25015756, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3086/5680 [8:00:48<5:42:24,  7.92s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3087/5680 [8:00:56<5:41:40,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4296', 'grad_norm': '0.301', 'learning_rate': '8.644e-05', 'ppl': '1.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 25288704, 'tokens/trainable': 25023856, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 3087/5680 [8:00:56<5:41:40,  7.91s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3088/5680 [8:01:04<5:40:49,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5708', 'grad_norm': '0.3404', 'learning_rate': '8.638e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 25296896, 'tokens/trainable': 25032028, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3088/5680 [8:01:04<5:40:49,  7.89s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3089/5680 [8:01:11<5:40:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6536', 'grad_norm': '0.426', 'learning_rate': '8.633e-05', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25305088, 'tokens/trainable': 25040188, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3089/5680 [8:01:11<5:40:15,  7.88s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3090/5680 [8:01:19<5:40:06,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6937', 'grad_norm': '0.3839', 'learning_rate': '8.627e-05', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25313280, 'tokens/trainable': 25048288, 'epoch': '4.079'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3090/5680 [8:01:19<5:40:06,  7.88s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3091/5680 [8:01:27<5:40:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5695', 'grad_norm': '0.3378', 'learning_rate': '8.622e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 25321472, 'tokens/trainable': 25056422, 'epoch': '4.08'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 3091/5680 [8:01:27<5:40:00,  7.88s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3092/5680 [8:01:35<5:39:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3876', 'grad_norm': '0.3345', 'learning_rate': '8.616e-05', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 25329664, 'tokens/trainable': 25064580, 'epoch': '4.08'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3092/5680 [8:01:35<5:39:22,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3093/5680 [8:01:43<5:39:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6822', 'grad_norm': '0.3962', 'learning_rate': '8.611e-05', 'ppl': '1.978', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 25337856, 'tokens/trainable': 25072716, 'epoch': '4.08'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3093/5680 [8:01:43<5:39:25,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3094/5680 [8:01:51<5:39:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.9208', 'grad_norm': '0.3476', 'learning_rate': '8.605e-05', 'ppl': '2.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25346048, 'tokens/trainable': 25080864, 'epoch': '4.08'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3094/5680 [8:01:51<5:39:14,  7.87s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3095/5680 [8:01:59<5:39:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.441', 'grad_norm': '0.3833', 'learning_rate': '8.6e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25354240, 'tokens/trainable': 25088990, 'epoch': '4.08'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                       | 3095/5680 [8:01:59<5:39:15,  7.87s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3096/5680 [8:02:06<5:38:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3672', 'grad_norm': '0.3362', 'learning_rate': '8.594e-05', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25362432, 'tokens/trainable': 25097152, 'epoch': '4.08'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3096/5680 [8:02:06<5:38:57,  7.87s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3097/5680 [8:02:14<5:39:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4299', 'grad_norm': '0.3314', 'learning_rate': '8.589e-05', 'ppl': '1.537', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25370624, 'tokens/trainable': 25105336, 'epoch': '4.081'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3097/5680 [8:02:14<5:39:26,  7.88s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3098/5680 [8:02:22<5:39:36,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6235', 'grad_norm': '0.3339', 'learning_rate': '8.583e-05', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25378816, 'tokens/trainable': 25113512, 'epoch': '4.081'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 3098/5680 [8:02:22<5:39:36,  7.89s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3099/5680 [8:02:30<5:39:34,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4523', 'grad_norm': '0.344', 'learning_rate': '8.578e-05', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25387008, 'tokens/trainable': 25121682, 'epoch': '4.081'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3099/5680 [8:02:30<5:39:34,  7.89s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3100/5680 [8:02:38<5:39:32,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4478', 'grad_norm': '0.4367', 'learning_rate': '8.572e-05', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25395200, 'tokens/trainable': 25129828, 'epoch': '4.081'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3100/5680 [8:02:38<5:39:32,  7.90s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3101/5680 [8:02:46<5:38:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4414', 'grad_norm': '0.3228', 'learning_rate': '8.567e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 25403392, 'tokens/trainable': 25137964, 'epoch': '4.081'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3101/5680 [8:02:46<5:38:37,  7.88s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3102/5680 [8:02:54<5:38:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4773', 'grad_norm': '0.3156', 'learning_rate': '8.561e-05', 'ppl': '1.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 25411584, 'tokens/trainable': 25146124, 'epoch': '4.082'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 3102/5680 [8:02:54<5:38:08,  7.87s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3103/5680 [8:03:02<5:41:44,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4325', 'grad_norm': '0.3628', 'learning_rate': '8.556e-05', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.3', 'tokens/total': 25419776, 'tokens/trainable': 25154276, 'epoch': '4.082'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3103/5680 [8:03:02<5:41:44,  7.96s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3104/5680 [8:03:10<5:40:14,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.6529', 'grad_norm': '0.3428', 'learning_rate': '8.55e-05', 'ppl': '1.921', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 25427968, 'tokens/trainable': 25162444, 'epoch': '4.082'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3104/5680 [8:03:10<5:40:14,  7.92s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3105/5680 [8:03:18<5:39:06,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4816', 'grad_norm': '0.3643', 'learning_rate': '8.545e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25436160, 'tokens/trainable': 25170596, 'epoch': '4.082'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3105/5680 [8:03:18<5:39:06,  7.90s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3106/5680 [8:03:26<5:38:56,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4249', 'grad_norm': '0.3168', 'learning_rate': '8.54e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 25444352, 'tokens/trainable': 25178640, 'epoch': '4.082'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 3106/5680 [8:03:26<5:38:56,  7.90s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3107/5680 [8:03:33<5:38:34,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4028', 'grad_norm': '0.3738', 'learning_rate': '8.534e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 25452544, 'tokens/trainable': 25186756, 'epoch': '4.082'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3107/5680 [8:03:33<5:38:34,  7.90s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3108/5680 [8:03:41<5:37:46,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5516', 'grad_norm': '0.3772', 'learning_rate': '8.529e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 25460736, 'tokens/trainable': 25194908, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3108/5680 [8:03:41<5:37:46,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3109/5680 [8:03:49<5:37:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4815', 'grad_norm': '0.365', 'learning_rate': '8.523e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 25468928, 'tokens/trainable': 25203044, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 3109/5680 [8:03:49<5:37:39,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3110/5680 [8:03:57<5:37:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4637', 'grad_norm': '0.349', 'learning_rate': '8.518e-05', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25477120, 'tokens/trainable': 25211174, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3110/5680 [8:03:57<5:37:04,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3111/5680 [8:04:05<5:37:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4691', 'grad_norm': '0.3714', 'learning_rate': '8.512e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 25485312, 'tokens/trainable': 25219276, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3111/5680 [8:04:05<5:37:13,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3112/5680 [8:04:13<5:37:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4893', 'grad_norm': '0.3885', 'learning_rate': '8.507e-05', 'ppl': '1.631', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 25493504, 'tokens/trainable': 25227336, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3112/5680 [8:04:13<5:37:00,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3113/5680 [8:04:21<5:36:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6019', 'grad_norm': '0.4238', 'learning_rate': '8.501e-05', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 25501696, 'tokens/trainable': 25235504, 'epoch': '4.083'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 3113/5680 [8:04:21<5:36:50,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3114/5680 [8:04:29<5:37:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4527', 'grad_norm': '0.5059', 'learning_rate': '8.496e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25509888, 'tokens/trainable': 25243658, 'epoch': '4.084'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3114/5680 [8:04:29<5:37:11,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3115/5680 [8:04:36<5:36:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4247', 'grad_norm': '0.3225', 'learning_rate': '8.49e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25518080, 'tokens/trainable': 25251812, 'epoch': '4.084'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3115/5680 [8:04:36<5:36:34,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3116/5680 [8:04:44<5:36:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5084', 'grad_norm': '0.3574', 'learning_rate': '8.485e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25526272, 'tokens/trainable': 25259954, 'epoch': '4.084'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3116/5680 [8:04:44<5:36:30,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3117/5680 [8:04:52<5:35:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3703', 'grad_norm': '0.3005', 'learning_rate': '8.479e-05', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 25534464, 'tokens/trainable': 25268084, 'epoch': '4.084'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 3117/5680 [8:04:52<5:35:50,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3118/5680 [8:05:00<5:35:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4969', 'grad_norm': '0.3274', 'learning_rate': '8.474e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 25542656, 'tokens/trainable': 25276268, 'epoch': '4.084'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3118/5680 [8:05:00<5:35:15,  7.85s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3119/5680 [8:05:08<5:35:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5864', 'grad_norm': '0.3744', 'learning_rate': '8.468e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25550848, 'tokens/trainable': 25284432, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3119/5680 [8:05:08<5:35:25,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3120/5680 [8:05:16<5:34:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4939', 'grad_norm': '0.3152', 'learning_rate': '8.463e-05', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25559040, 'tokens/trainable': 25292474, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3120/5680 [8:05:16<5:34:53,  7.85s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3121/5680 [8:05:24<5:35:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6086', 'grad_norm': '0.4137', 'learning_rate': '8.458e-05', 'ppl': '1.838', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 25567232, 'tokens/trainable': 25300552, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 3121/5680 [8:05:24<5:35:42,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3122/5680 [8:05:31<5:34:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5145', 'grad_norm': '0.3716', 'learning_rate': '8.452e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25575424, 'tokens/trainable': 25308680, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3122/5680 [8:05:31<5:34:56,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3123/5680 [8:05:39<5:34:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7286', 'grad_norm': '0.4039', 'learning_rate': '8.447e-05', 'ppl': '2.072', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 25583616, 'tokens/trainable': 25316836, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3123/5680 [8:05:39<5:34:20,  7.85s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3124/5680 [8:05:47<5:34:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5578', 'grad_norm': '0.3684', 'learning_rate': '8.441e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 25591808, 'tokens/trainable': 25324936, 'epoch': '4.085'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 3124/5680 [8:05:47<5:34:50,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3125/5680 [8:05:55<5:35:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5017', 'grad_norm': '0.37', 'learning_rate': '8.436e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 25600000, 'tokens/trainable': 25333018, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3125/5680 [8:05:55<5:35:21,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3126/5680 [8:06:03<5:35:31,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6118', 'grad_norm': '0.3866', 'learning_rate': '8.43e-05', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25608192, 'tokens/trainable': 25341158, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3126/5680 [8:06:03<5:35:31,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3127/5680 [8:06:11<5:35:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4599', 'grad_norm': '0.4689', 'learning_rate': '8.425e-05', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25616384, 'tokens/trainable': 25349336, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3127/5680 [8:06:11<5:35:27,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3128/5680 [8:06:19<5:34:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4127', 'grad_norm': '0.3116', 'learning_rate': '8.419e-05', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 25624576, 'tokens/trainable': 25357516, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 3128/5680 [8:06:19<5:34:50,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3129/5680 [8:06:27<5:35:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3622', 'grad_norm': '0.351', 'learning_rate': '8.414e-05', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 25632768, 'tokens/trainable': 25365672, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3129/5680 [8:06:27<5:35:08,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3130/5680 [8:06:34<5:34:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5561', 'grad_norm': '0.3599', 'learning_rate': '8.408e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 25640960, 'tokens/trainable': 25373848, 'epoch': '4.086'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3130/5680 [8:06:34<5:34:32,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3131/5680 [8:06:42<5:33:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4376', 'grad_norm': '0.2931', 'learning_rate': '8.403e-05', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25649152, 'tokens/trainable': 25381968, 'epoch': '4.087'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3131/5680 [8:06:42<5:33:54,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3132/5680 [8:06:50<5:33:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5225', 'grad_norm': '0.3662', 'learning_rate': '8.397e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25657344, 'tokens/trainable': 25390132, 'epoch': '4.087'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3132/5680 [8:06:50<5:33:47,  7.86s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3133/5680 [8:06:58<5:34:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4889', 'grad_norm': '0.3419', 'learning_rate': '8.392e-05', 'ppl': '1.63', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 25665536, 'tokens/trainable': 25398308, 'epoch': '4.087'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3133/5680 [8:06:58<5:34:35,  7.88s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3134/5680 [8:07:06<5:33:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4621', 'grad_norm': '0.3076', 'learning_rate': '8.387e-05', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25673728, 'tokens/trainable': 25406408, 'epoch': '4.087'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3134/5680 [8:07:06<5:33:53,  7.87s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3135/5680 [8:07:14<5:33:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5103', 'grad_norm': '0.4107', 'learning_rate': '8.381e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25681920, 'tokens/trainable': 25414560, 'epoch': '4.087'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3135/5680 [8:07:14<5:33:51,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3136/5680 [8:07:22<5:33:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4496', 'grad_norm': '0.3378', 'learning_rate': '8.376e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 25690112, 'tokens/trainable': 25422732, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3136/5680 [8:07:22<5:33:34,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3137/5680 [8:07:29<5:33:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5575', 'grad_norm': '0.3248', 'learning_rate': '8.37e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25698304, 'tokens/trainable': 25430848, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3137/5680 [8:07:29<5:33:09,  7.86s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3138/5680 [8:07:37<5:33:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.308', 'grad_norm': '0.2767', 'learning_rate': '8.365e-05', 'ppl': '1.361', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 25706496, 'tokens/trainable': 25438918, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3138/5680 [8:07:37<5:33:32,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3139/5680 [8:07:45<5:33:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6523', 'grad_norm': '0.3448', 'learning_rate': '8.359e-05', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25714688, 'tokens/trainable': 25447000, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 3139/5680 [8:07:45<5:33:15,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3140/5680 [8:07:53<5:33:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3332', 'grad_norm': '0.2918', 'learning_rate': '8.354e-05', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25722880, 'tokens/trainable': 25455140, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3140/5680 [8:07:53<5:33:10,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3141/5680 [8:08:01<5:32:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.751', 'grad_norm': '0.339', 'learning_rate': '8.348e-05', 'ppl': '2.119', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25731072, 'tokens/trainable': 25463258, 'epoch': '4.088'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3141/5680 [8:08:01<5:32:41,  7.86s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3142/5680 [8:08:09<5:33:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4454', 'grad_norm': '0.3454', 'learning_rate': '8.343e-05', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 25739264, 'tokens/trainable': 25471356, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3142/5680 [8:08:09<5:33:04,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3143/5680 [8:08:17<5:33:24,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4071', 'grad_norm': '0.3465', 'learning_rate': '8.337e-05', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 25747456, 'tokens/trainable': 25479500, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 3143/5680 [8:08:17<5:33:24,  7.89s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3144/5680 [8:08:25<5:33:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5149', 'grad_norm': '0.3376', 'learning_rate': '8.332e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 25755648, 'tokens/trainable': 25487522, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3144/5680 [8:08:25<5:33:08,  7.88s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3145/5680 [8:08:32<5:33:08,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3213', 'grad_norm': '0.3358', 'learning_rate': '8.326e-05', 'ppl': '1.379', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 25763840, 'tokens/trainable': 25495576, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3145/5680 [8:08:32<5:33:08,  7.89s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3146/5680 [8:08:40<5:32:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4157', 'grad_norm': '0.3758', 'learning_rate': '8.321e-05', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 25772032, 'tokens/trainable': 25503606, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 3146/5680 [8:08:40<5:32:57,  7.88s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3147/5680 [8:08:48<5:32:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.377', 'grad_norm': '0.3257', 'learning_rate': '8.316e-05', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 25780224, 'tokens/trainable': 25511784, 'epoch': '4.089'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3147/5680 [8:08:48<5:32:45,  7.88s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3148/5680 [8:08:56<5:32:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5713', 'grad_norm': '0.3619', 'learning_rate': '8.31e-05', 'ppl': '1.771', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 25788416, 'tokens/trainable': 25519824, 'epoch': '4.09'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3148/5680 [8:08:56<5:32:05,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3149/5680 [8:09:04<5:31:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6471', 'grad_norm': '0.3329', 'learning_rate': '8.305e-05', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25796608, 'tokens/trainable': 25527972, 'epoch': '4.09'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3149/5680 [8:09:04<5:31:56,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3150/5680 [8:09:12<5:31:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3924', 'grad_norm': '0.3795', 'learning_rate': '8.299e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25804800, 'tokens/trainable': 25536124, 'epoch': '4.09'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 3150/5680 [8:09:12<5:31:55,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3151/5680 [8:09:20<5:31:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4445', 'grad_norm': '0.3538', 'learning_rate': '8.294e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 25812992, 'tokens/trainable': 25544140, 'epoch': '4.09'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3151/5680 [8:09:20<5:31:37,  7.87s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3152/5680 [8:09:28<5:31:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5463', 'grad_norm': '0.315', 'learning_rate': '8.288e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 25821184, 'tokens/trainable': 25552088, 'epoch': '4.09'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3152/5680 [8:09:28<5:31:31,  7.87s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3153/5680 [8:09:35<5:31:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5081', 'grad_norm': '0.372', 'learning_rate': '8.283e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 25829376, 'tokens/trainable': 25560204, 'epoch': '4.09'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3153/5680 [8:09:35<5:31:03,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3154/5680 [8:09:43<5:30:33,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5003', 'grad_norm': '0.3093', 'learning_rate': '8.277e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25837568, 'tokens/trainable': 25568318, 'epoch': '4.091'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 3154/5680 [8:09:43<5:30:33,  7.85s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3155/5680 [8:09:51<5:30:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4188', 'grad_norm': '0.4334', 'learning_rate': '8.272e-05', 'ppl': '1.52', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 25845760, 'tokens/trainable': 25576234, 'epoch': '4.091'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3155/5680 [8:09:51<5:30:27,  7.85s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3156/5680 [8:09:59<5:30:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5849', 'grad_norm': '0.4415', 'learning_rate': '8.267e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 25853952, 'tokens/trainable': 25584208, 'epoch': '4.091'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3156/5680 [8:09:59<5:30:12,  7.85s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3157/5680 [8:10:07<5:30:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4427', 'grad_norm': '0.3053', 'learning_rate': '8.261e-05', 'ppl': '1.557', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 25862144, 'tokens/trainable': 25592384, 'epoch': '4.091'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3157/5680 [8:10:07<5:30:35,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3158/5680 [8:10:15<5:30:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.492', 'grad_norm': '0.3937', 'learning_rate': '8.256e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 25870336, 'tokens/trainable': 25600432, 'epoch': '4.091'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 3158/5680 [8:10:15<5:30:52,  7.87s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3159/5680 [8:10:23<5:30:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3952', 'grad_norm': '0.3379', 'learning_rate': '8.25e-05', 'ppl': '1.485', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 25878528, 'tokens/trainable': 25608600, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3159/5680 [8:10:23<5:30:40,  7.87s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3160/5680 [8:10:30<5:30:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4378', 'grad_norm': '0.3858', 'learning_rate': '8.245e-05', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 25886720, 'tokens/trainable': 25616578, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3160/5680 [8:10:30<5:30:33,  7.87s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3161/5680 [8:10:38<5:30:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7654', 'grad_norm': '0.3445', 'learning_rate': '8.239e-05', 'ppl': '2.15', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.3', 'tokens/total': 25894912, 'tokens/trainable': 25624368, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 3161/5680 [8:10:38<5:30:06,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3162/5680 [8:10:46<5:30:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4977', 'grad_norm': '0.3648', 'learning_rate': '8.234e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 25903104, 'tokens/trainable': 25632528, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3162/5680 [8:10:46<5:30:00,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3163/5680 [8:10:54<5:29:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5024', 'grad_norm': '0.3964', 'learning_rate': '8.228e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 25911296, 'tokens/trainable': 25640504, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3163/5680 [8:10:54<5:29:56,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3164/5680 [8:11:02<5:29:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6083', 'grad_norm': '0.3508', 'learning_rate': '8.223e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 25919488, 'tokens/trainable': 25648644, 'epoch': '4.092'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3164/5680 [8:11:02<5:29:36,  7.86s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3165/5680 [8:11:10<5:29:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3942', 'grad_norm': '0.3382', 'learning_rate': '8.218e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 25927680, 'tokens/trainable': 25656708, 'epoch': '4.093'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 3165/5680 [8:11:10<5:29:07,  7.85s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3166/5680 [8:11:18<5:29:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5612', 'grad_norm': '0.352', 'learning_rate': '8.212e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.1', 'tokens/total': 25935872, 'tokens/trainable': 25664596, 'epoch': '4.093'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3166/5680 [8:11:18<5:29:33,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3167/5680 [8:11:25<5:29:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6536', 'grad_norm': '0.3547', 'learning_rate': '8.207e-05', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 25944064, 'tokens/trainable': 25672708, 'epoch': '4.093'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3167/5680 [8:11:25<5:29:18,  7.86s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3168/5680 [8:11:33<5:29:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4907', 'grad_norm': '0.3719', 'learning_rate': '8.201e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 25952256, 'tokens/trainable': 25680792, 'epoch': '4.093'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3168/5680 [8:11:33<5:29:13,  7.86s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3169/5680 [8:11:41<5:29:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4411', 'grad_norm': '0.3447', 'learning_rate': '8.196e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 25960448, 'tokens/trainable': 25688898, 'epoch': '4.093'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                     | 3169/5680 [8:11:41<5:29:01,  7.86s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3170/5680 [8:11:49<5:29:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4385', 'grad_norm': '0.4017', 'learning_rate': '8.19e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 25968640, 'tokens/trainable': 25696962, 'epoch': '4.093'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3170/5680 [8:11:49<5:29:17,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3171/5680 [8:11:57<5:29:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3944', 'grad_norm': '0.3238', 'learning_rate': '8.185e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 25976832, 'tokens/trainable': 25704976, 'epoch': '4.094'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3171/5680 [8:11:57<5:29:06,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3172/5680 [8:12:05<5:28:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5901', 'grad_norm': '0.3911', 'learning_rate': '8.179e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 25985024, 'tokens/trainable': 25713094, 'epoch': '4.094'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 3172/5680 [8:12:05<5:28:58,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3173/5680 [8:12:13<5:28:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.641', 'grad_norm': '0.3685', 'learning_rate': '8.174e-05', 'ppl': '1.898', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 25993216, 'tokens/trainable': 25721232, 'epoch': '4.094'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3173/5680 [8:12:13<5:28:50,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3174/5680 [8:12:21<5:28:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3797', 'grad_norm': '0.3545', 'learning_rate': '8.169e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 26001408, 'tokens/trainable': 25729272, 'epoch': '4.094'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3174/5680 [8:12:21<5:28:48,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3175/5680 [8:12:28<5:28:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5382', 'grad_norm': '0.3362', 'learning_rate': '8.163e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.2', 'tokens/total': 26009600, 'tokens/trainable': 25737100, 'epoch': '4.094'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3175/5680 [8:12:28<5:28:25,  7.87s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3176/5680 [8:12:36<5:27:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7105', 'grad_norm': '0.5481', 'learning_rate': '8.158e-05', 'ppl': '2.035', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 26017792, 'tokens/trainable': 25744916, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 3176/5680 [8:12:36<5:27:48,  7.85s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3177/5680 [8:12:44<5:27:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5527', 'grad_norm': '0.3965', 'learning_rate': '8.152e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 26025984, 'tokens/trainable': 25752852, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3177/5680 [8:12:44<5:27:30,  7.85s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3178/5680 [8:12:52<5:27:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.673', 'grad_norm': '0.3717', 'learning_rate': '8.147e-05', 'ppl': '1.96', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 26034176, 'tokens/trainable': 25760928, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3178/5680 [8:12:52<5:27:23,  7.85s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3179/5680 [8:13:00<5:26:57,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6284', 'grad_norm': '0.3678', 'learning_rate': '8.141e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 26042368, 'tokens/trainable': 25768904, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3179/5680 [8:13:00<5:26:57,  7.84s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3180/5680 [8:13:08<5:28:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.745', 'grad_norm': '0.3388', 'learning_rate': '8.136e-05', 'ppl': '2.106', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26050560, 'tokens/trainable': 25776904, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3180/5680 [8:13:08<5:28:18,  7.88s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3181/5680 [8:13:16<5:29:18,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5772', 'grad_norm': '0.5426', 'learning_rate': '8.131e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.6', 'tokens/total': 26058752, 'tokens/trainable': 25784748, 'epoch': '4.095'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3181/5680 [8:13:16<5:29:18,  7.91s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3182/5680 [8:13:24<5:30:47,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.483', 'grad_norm': '0.3888', 'learning_rate': '8.125e-05', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '956.6', 'tokens/total': 26066944, 'tokens/trainable': 25792420, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3182/5680 [8:13:24<5:30:47,  7.95s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3183/5680 [8:13:32<5:31:33,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4052', 'grad_norm': '0.2804', 'learning_rate': '8.12e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.9', 'tokens/total': 26075136, 'tokens/trainable': 25800156, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 3183/5680 [8:13:32<5:31:33,  7.97s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3184/5680 [8:13:40<5:31:44,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4882', 'grad_norm': '0.3656', 'learning_rate': '8.114e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 26083328, 'tokens/trainable': 25808296, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3184/5680 [8:13:40<5:31:44,  7.97s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3185/5680 [8:13:48<5:32:14,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7742', 'grad_norm': '0.4063', 'learning_rate': '8.109e-05', 'ppl': '2.169', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 26091520, 'tokens/trainable': 25816232, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3185/5680 [8:13:48<5:32:14,  7.99s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3186/5680 [8:13:56<5:31:51,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5393', 'grad_norm': '0.3081', 'learning_rate': '8.103e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.6', 'tokens/total': 26099712, 'tokens/trainable': 25824196, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3186/5680 [8:13:56<5:31:51,  7.98s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3187/5680 [8:14:04<5:32:25,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4991', 'grad_norm': '0.3084', 'learning_rate': '8.098e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.5', 'tokens/total': 26107904, 'tokens/trainable': 25832054, 'epoch': '4.096'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 3187/5680 [8:14:04<5:32:25,  8.00s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3188/5680 [8:14:12<5:32:21,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.53', 'grad_norm': '0.4258', 'learning_rate': '8.093e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 26116096, 'tokens/trainable': 25840232, 'epoch': '4.097'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3188/5680 [8:14:12<5:32:21,  8.00s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3189/5680 [8:14:20<5:32:21,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5377', 'grad_norm': '0.3099', 'learning_rate': '8.087e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 26124288, 'tokens/trainable': 25848412, 'epoch': '4.097'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3189/5680 [8:14:20<5:32:21,  8.01s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3190/5680 [8:14:28<5:32:43,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5945', 'grad_norm': '0.3986', 'learning_rate': '8.082e-05', 'ppl': '1.812', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990', 'tokens/total': 26132480, 'tokens/trainable': 25856376, 'epoch': '4.097'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3190/5680 [8:14:28<5:32:43,  8.02s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3191/5680 [8:14:36<5:32:44,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6848', 'grad_norm': '0.3811', 'learning_rate': '8.076e-05', 'ppl': '1.983', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 26140672, 'tokens/trainable': 25864346, 'epoch': '4.097'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                    | 3191/5680 [8:14:36<5:32:44,  8.02s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3192/5680 [8:14:44<5:32:01,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.423', 'grad_norm': '0.34', 'learning_rate': '8.071e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.8', 'tokens/total': 26148864, 'tokens/trainable': 25872292, 'epoch': '4.097'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3192/5680 [8:14:44<5:32:01,  8.01s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3193/5680 [8:14:52<5:31:18,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4267', 'grad_norm': '0.3145', 'learning_rate': '8.065e-05', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26157056, 'tokens/trainable': 25880292, 'epoch': '4.098'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3193/5680 [8:14:52<5:31:18,  7.99s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3194/5680 [8:15:00<5:31:20,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5342', 'grad_norm': '0.3476', 'learning_rate': '8.06e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.6', 'tokens/total': 26165248, 'tokens/trainable': 25888100, 'epoch': '4.098'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 3194/5680 [8:15:00<5:31:20,  8.00s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3195/5680 [8:15:08<5:31:02,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.516', 'grad_norm': '0.3583', 'learning_rate': '8.055e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 26173440, 'tokens/trainable': 25896024, 'epoch': '4.098'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3195/5680 [8:15:08<5:31:02,  7.99s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3196/5680 [8:15:16<5:31:03,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.612', 'grad_norm': '0.3395', 'learning_rate': '8.049e-05', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 26181632, 'tokens/trainable': 25904132, 'epoch': '4.098'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3196/5680 [8:15:16<5:31:03,  8.00s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3197/5680 [8:15:24<5:31:22,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4368', 'grad_norm': '0.3088', 'learning_rate': '8.044e-05', 'ppl': '1.548', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.6', 'tokens/total': 26189824, 'tokens/trainable': 25911806, 'epoch': '4.098'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3197/5680 [8:15:24<5:31:22,  8.01s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3198/5680 [8:15:32<5:31:30,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5139', 'grad_norm': '0.377', 'learning_rate': '8.038e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.1', 'tokens/total': 26198016, 'tokens/trainable': 25919608, 'epoch': '4.098'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 3198/5680 [8:15:32<5:31:30,  8.01s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3199/5680 [8:15:40<5:30:56,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4763', 'grad_norm': '0.3136', 'learning_rate': '8.033e-05', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 26206208, 'tokens/trainable': 25927652, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3199/5680 [8:15:40<5:30:56,  8.00s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3200/5680 [8:15:48<5:31:18,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6141', 'grad_norm': '0.4223', 'learning_rate': '8.027e-05', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.5', 'tokens/total': 26214400, 'tokens/trainable': 25935602, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3200/5680 [8:15:48<5:31:18,  8.02s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3201/5680 [8:15:56<5:30:41,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.924', 'grad_norm': '0.4062', 'learning_rate': '8.022e-05', 'ppl': '2.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985', 'tokens/total': 26222592, 'tokens/trainable': 25943456, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3201/5680 [8:15:56<5:30:41,  8.00s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3202/5680 [8:16:04<5:30:44,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.464', 'grad_norm': '0.344', 'learning_rate': '8.017e-05', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 26230784, 'tokens/trainable': 25951616, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 3202/5680 [8:16:04<5:30:44,  8.01s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3203/5680 [8:16:13<5:39:01,  8.21s/it]                                                                                                                                                                                                                                             {'loss': '0.5241', 'grad_norm': '0.4088', 'learning_rate': '8.011e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '904.1', 'tokens/total': 26238976, 'tokens/trainable': 25959468, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3203/5680 [8:16:13<5:39:01,  8.21s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3204/5680 [8:16:21<5:35:58,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.921', 'grad_norm': '0.3928', 'learning_rate': '8.006e-05', 'ppl': '2.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26247168, 'tokens/trainable': 25967480, 'epoch': '4.099'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3204/5680 [8:16:21<5:35:58,  8.14s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3205/5680 [8:16:29<5:34:16,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.4533', 'grad_norm': '0.3354', 'learning_rate': '8e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.4', 'tokens/total': 26255360, 'tokens/trainable': 25975488, 'epoch': '4.1'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3205/5680 [8:16:29<5:34:16,  8.10s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3206/5680 [8:16:37<5:32:47,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.4962', 'grad_norm': '0.3923', 'learning_rate': '7.995e-05', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.2', 'tokens/total': 26263552, 'tokens/trainable': 25983122, 'epoch': '4.1'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 3206/5680 [8:16:37<5:32:47,  8.07s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3207/5680 [8:16:45<5:31:50,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.429', 'grad_norm': '0.3847', 'learning_rate': '7.989e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.1', 'tokens/total': 26271744, 'tokens/trainable': 25991110, 'epoch': '4.1'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3207/5680 [8:16:45<5:31:50,  8.05s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3208/5680 [8:16:53<5:31:04,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.3265', 'grad_norm': '0.2682', 'learning_rate': '7.984e-05', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.4', 'tokens/total': 26279936, 'tokens/trainable': 25998992, 'epoch': '4.1'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3208/5680 [8:16:53<5:31:04,  8.04s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3209/5680 [8:17:01<5:30:46,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5736', 'grad_norm': '0.3669', 'learning_rate': '7.979e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 26288128, 'tokens/trainable': 26007036, 'epoch': '4.1'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 3209/5680 [8:17:01<5:30:46,  8.03s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3210/5680 [8:17:09<5:30:45,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.8683', 'grad_norm': '0.4831', 'learning_rate': '7.973e-05', 'ppl': '2.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.5', 'tokens/total': 26296320, 'tokens/trainable': 26014984, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3210/5680 [8:17:09<5:30:45,  8.03s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3211/5680 [8:17:17<5:29:57,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6165', 'grad_norm': '0.701', 'learning_rate': '7.968e-05', 'ppl': '1.852', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.4', 'tokens/total': 26304512, 'tokens/trainable': 26022916, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3211/5680 [8:17:17<5:29:57,  8.02s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3212/5680 [8:17:25<5:29:23,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7435', 'grad_norm': '0.4641', 'learning_rate': '7.962e-05', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986', 'tokens/total': 26312704, 'tokens/trainable': 26030772, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3212/5680 [8:17:25<5:29:23,  8.01s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3213/5680 [8:17:33<5:29:07,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5987', 'grad_norm': '0.3446', 'learning_rate': '7.957e-05', 'ppl': '1.82', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.3', 'tokens/total': 26320896, 'tokens/trainable': 26038570, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 3213/5680 [8:17:33<5:29:07,  8.00s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3214/5680 [8:17:41<5:29:10,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5702', 'grad_norm': '0.3629', 'learning_rate': '7.952e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.9', 'tokens/total': 26329088, 'tokens/trainable': 26046568, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3214/5680 [8:17:41<5:29:10,  8.01s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3215/5680 [8:17:49<5:28:46,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5152', 'grad_norm': '0.3563', 'learning_rate': '7.946e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 26337280, 'tokens/trainable': 26054466, 'epoch': '4.101'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3215/5680 [8:17:49<5:28:46,  8.00s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3216/5680 [8:17:57<5:28:10,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7015', 'grad_norm': '0.3565', 'learning_rate': '7.941e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.2', 'tokens/total': 26345472, 'tokens/trainable': 26062312, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3216/5680 [8:17:57<5:28:10,  7.99s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3217/5680 [8:18:04<5:27:40,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5556', 'grad_norm': '0.4165', 'learning_rate': '7.935e-05', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 26353664, 'tokens/trainable': 26070388, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 3217/5680 [8:18:04<5:27:40,  7.98s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3218/5680 [8:18:12<5:27:37,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.403', 'grad_norm': '0.3345', 'learning_rate': '7.93e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.8', 'tokens/total': 26361856, 'tokens/trainable': 26078262, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3218/5680 [8:18:12<5:27:37,  7.98s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3219/5680 [8:18:21<5:28:01,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5518', 'grad_norm': '0.3455', 'learning_rate': '7.924e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 26370048, 'tokens/trainable': 26086448, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3219/5680 [8:18:21<5:28:01,  8.00s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3220/5680 [8:18:29<5:27:56,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5482', 'grad_norm': '0.3755', 'learning_rate': '7.919e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 26378240, 'tokens/trainable': 26094578, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 3220/5680 [8:18:29<5:27:56,  8.00s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3221/5680 [8:18:36<5:27:12,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4569', 'grad_norm': '0.3248', 'learning_rate': '7.914e-05', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.9', 'tokens/total': 26386432, 'tokens/trainable': 26102500, 'epoch': '4.102'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3221/5680 [8:18:36<5:27:12,  7.98s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3222/5680 [8:18:44<5:26:59,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4247', 'grad_norm': '0.3054', 'learning_rate': '7.908e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 26394624, 'tokens/trainable': 26110484, 'epoch': '4.103'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3222/5680 [8:18:44<5:26:59,  7.98s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3223/5680 [8:18:52<5:26:54,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.571', 'grad_norm': '0.4113', 'learning_rate': '7.903e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 26402816, 'tokens/trainable': 26118532, 'epoch': '4.103'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3223/5680 [8:18:52<5:26:54,  7.98s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3224/5680 [8:19:00<5:27:16,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6287', 'grad_norm': '0.385', 'learning_rate': '7.897e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 26411008, 'tokens/trainable': 26126608, 'epoch': '4.103'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 3224/5680 [8:19:00<5:27:16,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3225/5680 [8:19:08<5:27:10,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.7355', 'grad_norm': '0.4393', 'learning_rate': '7.892e-05', 'ppl': '2.087', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 26419200, 'tokens/trainable': 26134680, 'epoch': '4.103'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3225/5680 [8:19:08<5:27:10,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3226/5680 [8:19:16<5:26:55,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4549', 'grad_norm': '0.3498', 'learning_rate': '7.887e-05', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26427392, 'tokens/trainable': 26142708, 'epoch': '4.103'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3226/5680 [8:19:16<5:26:55,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3227/5680 [8:19:24<5:26:27,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4392', 'grad_norm': '0.3888', 'learning_rate': '7.881e-05', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979', 'tokens/total': 26435584, 'tokens/trainable': 26150504, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3227/5680 [8:19:24<5:26:27,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3228/5680 [8:19:32<5:26:05,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8254', 'grad_norm': '0.4293', 'learning_rate': '7.876e-05', 'ppl': '2.283', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '962.4', 'tokens/total': 26443776, 'tokens/trainable': 26158168, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 3228/5680 [8:19:32<5:26:05,  7.98s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3229/5680 [8:19:40<5:26:27,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4871', 'grad_norm': '0.371', 'learning_rate': '7.87e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.7', 'tokens/total': 26451968, 'tokens/trainable': 26166176, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3229/5680 [8:19:40<5:26:27,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3230/5680 [8:19:48<5:26:02,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4182', 'grad_norm': '0.331', 'learning_rate': '7.865e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26460160, 'tokens/trainable': 26174186, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3230/5680 [8:19:48<5:26:02,  7.98s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3231/5680 [8:19:56<5:26:11,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4655', 'grad_norm': '0.3496', 'learning_rate': '7.86e-05', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 26468352, 'tokens/trainable': 26182366, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 3231/5680 [8:19:56<5:26:11,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3232/5680 [8:20:04<5:25:55,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4079', 'grad_norm': '0.3758', 'learning_rate': '7.854e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '950.8', 'tokens/total': 26476544, 'tokens/trainable': 26189952, 'epoch': '4.104'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3232/5680 [8:20:04<5:25:55,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3233/5680 [8:20:12<5:25:35,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5639', 'grad_norm': '0.4178', 'learning_rate': '7.849e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.3', 'tokens/total': 26484736, 'tokens/trainable': 26197788, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3233/5680 [8:20:12<5:25:35,  7.98s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3234/5680 [8:20:20<5:26:19,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.39', 'grad_norm': '0.3655', 'learning_rate': '7.843e-05', 'ppl': '1.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '966.5', 'tokens/total': 26492928, 'tokens/trainable': 26205570, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3234/5680 [8:20:20<5:26:19,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3235/5680 [8:20:28<5:26:21,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5561', 'grad_norm': '0.3888', 'learning_rate': '7.838e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 26501120, 'tokens/trainable': 26213606, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 3235/5680 [8:20:28<5:26:21,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3236/5680 [8:20:36<5:26:19,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4866', 'grad_norm': '0.4513', 'learning_rate': '7.833e-05', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '933.3', 'tokens/total': 26509312, 'tokens/trainable': 26221086, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3236/5680 [8:20:36<5:26:19,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3237/5680 [8:20:44<5:26:15,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4587', 'grad_norm': '0.3225', 'learning_rate': '7.827e-05', 'ppl': '1.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.8', 'tokens/total': 26517504, 'tokens/trainable': 26228932, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3237/5680 [8:20:44<5:26:15,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3238/5680 [8:20:52<5:26:18,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4138', 'grad_norm': '0.3879', 'learning_rate': '7.822e-05', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.5', 'tokens/total': 26525696, 'tokens/trainable': 26236944, 'epoch': '4.105'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3238/5680 [8:20:52<5:26:18,  8.02s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3239/5680 [8:21:00<5:25:59,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4064', 'grad_norm': '0.2909', 'learning_rate': '7.816e-05', 'ppl': '1.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.8', 'tokens/total': 26533888, 'tokens/trainable': 26244806, 'epoch': '4.106'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 3239/5680 [8:21:00<5:25:59,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3240/5680 [8:21:08<5:25:09,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6608', 'grad_norm': '0.3859', 'learning_rate': '7.811e-05', 'ppl': '1.936', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.3', 'tokens/total': 26542080, 'tokens/trainable': 26252666, 'epoch': '4.106'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3240/5680 [8:21:08<5:25:09,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3241/5680 [8:21:16<5:24:55,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4805', 'grad_norm': '0.4148', 'learning_rate': '7.806e-05', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.3', 'tokens/total': 26550272, 'tokens/trainable': 26260534, 'epoch': '4.106'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3241/5680 [8:21:16<5:24:55,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3242/5680 [8:21:24<5:25:03,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4806', 'grad_norm': '0.3963', 'learning_rate': '7.8e-05', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '935.7', 'tokens/total': 26558464, 'tokens/trainable': 26268032, 'epoch': '4.106'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3242/5680 [8:21:24<5:25:03,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3243/5680 [8:21:32<5:25:14,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4134', 'grad_norm': '0.3937', 'learning_rate': '7.795e-05', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.4', 'tokens/total': 26566656, 'tokens/trainable': 26275770, 'epoch': '4.106'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 3243/5680 [8:21:32<5:25:14,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3244/5680 [8:21:40<5:25:15,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4918', 'grad_norm': '0.4494', 'learning_rate': '7.789e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.7', 'tokens/total': 26574848, 'tokens/trainable': 26283634, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3244/5680 [8:21:40<5:25:15,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3245/5680 [8:21:48<5:25:08,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6169', 'grad_norm': '0.3958', 'learning_rate': '7.784e-05', 'ppl': '1.853', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.6', 'tokens/total': 26583040, 'tokens/trainable': 26291584, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3245/5680 [8:21:48<5:25:08,  8.01s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3246/5680 [8:21:56<5:24:35,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6623', 'grad_norm': '0.3481', 'learning_rate': '7.779e-05', 'ppl': '1.939', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967.2', 'tokens/total': 26591232, 'tokens/trainable': 26299298, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 3246/5680 [8:21:56<5:24:35,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3247/5680 [8:22:04<5:23:42,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5661', 'grad_norm': '0.3379', 'learning_rate': '7.773e-05', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.2', 'tokens/total': 26599424, 'tokens/trainable': 26307182, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3247/5680 [8:22:04<5:23:42,  7.98s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3248/5680 [8:22:12<5:23:57,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5296', 'grad_norm': '0.4455', 'learning_rate': '7.768e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.7', 'tokens/total': 26607616, 'tokens/trainable': 26315176, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3248/5680 [8:22:12<5:23:57,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3249/5680 [8:22:20<5:23:40,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6071', 'grad_norm': '0.3712', 'learning_rate': '7.762e-05', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.2', 'tokens/total': 26615808, 'tokens/trainable': 26323052, 'epoch': '4.107'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3249/5680 [8:22:20<5:23:40,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3250/5680 [8:22:28<5:22:50,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.632', 'grad_norm': '0.4367', 'learning_rate': '7.757e-05', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 26624000, 'tokens/trainable': 26331080, 'epoch': '4.108'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 3250/5680 [8:22:28<5:22:50,  7.97s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3251/5680 [8:22:36<5:23:02,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7768', 'grad_norm': '0.53', 'learning_rate': '7.752e-05', 'ppl': '2.174', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 26632192, 'tokens/trainable': 26339184, 'epoch': '4.108'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3251/5680 [8:22:36<5:23:02,  7.98s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3252/5680 [8:22:44<5:23:11,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6577', 'grad_norm': '0.5231', 'learning_rate': '7.746e-05', 'ppl': '1.93', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 26640384, 'tokens/trainable': 26347348, 'epoch': '4.108'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3252/5680 [8:22:44<5:23:11,  7.99s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3253/5680 [8:22:52<5:23:40,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5106', 'grad_norm': '0.3761', 'learning_rate': '7.741e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '959.8', 'tokens/total': 26648576, 'tokens/trainable': 26355060, 'epoch': '4.108'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3253/5680 [8:22:52<5:23:40,  8.00s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3254/5680 [8:23:00<5:22:58,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.445', 'grad_norm': '0.3018', 'learning_rate': '7.736e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.6', 'tokens/total': 26656768, 'tokens/trainable': 26362860, 'epoch': '4.108'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                  | 3254/5680 [8:23:00<5:22:58,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3255/5680 [8:23:08<5:23:04,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4086', 'grad_norm': '0.3077', 'learning_rate': '7.73e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 26664960, 'tokens/trainable': 26370778, 'epoch': '4.108'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3255/5680 [8:23:08<5:23:04,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3256/5680 [8:23:16<5:23:13,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6031', 'grad_norm': '0.4071', 'learning_rate': '7.725e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983', 'tokens/total': 26673152, 'tokens/trainable': 26378656, 'epoch': '4.109'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3256/5680 [8:23:16<5:23:13,  8.00s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3257/5680 [8:23:24<5:23:19,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4534', 'grad_norm': '0.4039', 'learning_rate': '7.719e-05', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '946.4', 'tokens/total': 26681344, 'tokens/trainable': 26386244, 'epoch': '4.109'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 3257/5680 [8:23:24<5:23:19,  8.01s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3258/5680 [8:23:32<5:23:24,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4578', 'grad_norm': '0.2927', 'learning_rate': '7.714e-05', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '963.3', 'tokens/total': 26689536, 'tokens/trainable': 26393972, 'epoch': '4.109'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3258/5680 [8:23:32<5:23:24,  8.01s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3259/5680 [8:23:40<5:22:28,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.2785', 'grad_norm': '0.303', 'learning_rate': '7.709e-05', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 26697728, 'tokens/trainable': 26401996, 'epoch': '4.109'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3259/5680 [8:23:40<5:22:28,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3260/5680 [8:23:48<5:22:11,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5814', 'grad_norm': '0.3276', 'learning_rate': '7.703e-05', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.7', 'tokens/total': 26705920, 'tokens/trainable': 26409796, 'epoch': '4.109'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3260/5680 [8:23:48<5:22:11,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3261/5680 [8:23:56<5:21:40,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6598', 'grad_norm': '0.4172', 'learning_rate': '7.698e-05', 'ppl': '1.934', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '952.8', 'tokens/total': 26714112, 'tokens/trainable': 26417374, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 3261/5680 [8:23:56<5:21:40,  7.98s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3262/5680 [8:24:04<5:21:33,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4085', 'grad_norm': '0.3431', 'learning_rate': '7.692e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.7', 'tokens/total': 26722304, 'tokens/trainable': 26425150, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3262/5680 [8:24:04<5:21:33,  7.98s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3263/5680 [8:24:12<5:21:32,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4543', 'grad_norm': '0.3187', 'learning_rate': '7.687e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 26730496, 'tokens/trainable': 26433308, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3263/5680 [8:24:12<5:21:32,  7.98s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3264/5680 [8:24:20<5:21:37,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4254', 'grad_norm': '0.3938', 'learning_rate': '7.682e-05', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 26738688, 'tokens/trainable': 26441364, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3264/5680 [8:24:20<5:21:37,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3265/5680 [8:24:28<5:21:30,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.2962', 'grad_norm': '0.3018', 'learning_rate': '7.676e-05', 'ppl': '1.345', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.6', 'tokens/total': 26746880, 'tokens/trainable': 26449284, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 3265/5680 [8:24:28<5:21:30,  7.99s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3266/5680 [8:24:36<5:21:47,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4011', 'grad_norm': '0.3663', 'learning_rate': '7.671e-05', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 26755072, 'tokens/trainable': 26457432, 'epoch': '4.11'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3266/5680 [8:24:36<5:21:47,  8.00s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3267/5680 [8:24:44<5:21:37,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6562', 'grad_norm': '0.4063', 'learning_rate': '7.666e-05', 'ppl': '1.927', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.4', 'tokens/total': 26763264, 'tokens/trainable': 26465140, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3267/5680 [8:24:44<5:21:37,  8.00s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3268/5680 [8:24:52<5:20:48,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5779', 'grad_norm': '0.3891', 'learning_rate': '7.66e-05', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.7', 'tokens/total': 26771456, 'tokens/trainable': 26472854, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 3268/5680 [8:24:52<5:20:48,  7.98s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3269/5680 [8:25:00<5:21:04,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4115', 'grad_norm': '0.3992', 'learning_rate': '7.655e-05', 'ppl': '1.509', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26779648, 'tokens/trainable': 26480904, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3269/5680 [8:25:00<5:21:04,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3270/5680 [8:25:08<5:20:43,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6171', 'grad_norm': '0.3436', 'learning_rate': '7.649e-05', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 26787840, 'tokens/trainable': 26488976, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3270/5680 [8:25:08<5:20:43,  7.98s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3271/5680 [8:25:16<5:20:59,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.498', 'grad_norm': '0.3878', 'learning_rate': '7.644e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 26796032, 'tokens/trainable': 26496986, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3271/5680 [8:25:16<5:20:59,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3272/5680 [8:25:24<5:20:46,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3964', 'grad_norm': '0.3173', 'learning_rate': '7.639e-05', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967.6', 'tokens/total': 26804224, 'tokens/trainable': 26504712, 'epoch': '4.111'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 3272/5680 [8:25:24<5:20:46,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3273/5680 [8:25:32<5:20:31,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6557', 'grad_norm': '0.3646', 'learning_rate': '7.633e-05', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 26812416, 'tokens/trainable': 26512732, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3273/5680 [8:25:32<5:20:31,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3274/5680 [8:25:40<5:20:08,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4295', 'grad_norm': '0.3403', 'learning_rate': '7.628e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984', 'tokens/total': 26820608, 'tokens/trainable': 26520572, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3274/5680 [8:25:40<5:20:08,  7.98s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3275/5680 [8:25:48<5:20:10,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.9781', 'grad_norm': '0.3641', 'learning_rate': '7.623e-05', 'ppl': '2.659', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.4', 'tokens/total': 26828800, 'tokens/trainable': 26528386, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3275/5680 [8:25:48<5:20:10,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3276/5680 [8:25:56<5:20:03,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4287', 'grad_norm': '0.4084', 'learning_rate': '7.617e-05', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.1', 'tokens/total': 26836992, 'tokens/trainable': 26536142, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 3276/5680 [8:25:56<5:20:03,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3277/5680 [8:26:04<5:20:09,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3669', 'grad_norm': '0.4032', 'learning_rate': '7.612e-05', 'ppl': '1.443', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.6', 'tokens/total': 26845184, 'tokens/trainable': 26544008, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3277/5680 [8:26:04<5:20:09,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3278/5680 [8:26:12<5:20:17,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5041', 'grad_norm': '0.3746', 'learning_rate': '7.606e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 26853376, 'tokens/trainable': 26552182, 'epoch': '4.112'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3278/5680 [8:26:12<5:20:17,  8.00s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3279/5680 [8:26:20<5:19:43,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4097', 'grad_norm': '0.3282', 'learning_rate': '7.601e-05', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 26861568, 'tokens/trainable': 26560216, 'epoch': '4.113'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3279/5680 [8:26:20<5:19:43,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3280/5680 [8:26:28<5:19:36,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4751', 'grad_norm': '0.3064', 'learning_rate': '7.596e-05', 'ppl': '1.608', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 26869760, 'tokens/trainable': 26568280, 'epoch': '4.113'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3280/5680 [8:26:28<5:19:36,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3281/5680 [8:26:36<5:19:38,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5179', 'grad_norm': '0.3524', 'learning_rate': '7.59e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '965.7', 'tokens/total': 26877952, 'tokens/trainable': 26576008, 'epoch': '4.113'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3281/5680 [8:26:36<5:19:38,  7.99s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3282/5680 [8:26:44<5:19:36,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3395', 'grad_norm': '0.4553', 'learning_rate': '7.585e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.7', 'tokens/total': 26886144, 'tokens/trainable': 26583814, 'epoch': '4.113'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3282/5680 [8:26:44<5:19:36,  8.00s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3283/5680 [8:26:52<5:19:51,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5572', 'grad_norm': '0.3614', 'learning_rate': '7.58e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.2', 'tokens/total': 26894336, 'tokens/trainable': 26591480, 'epoch': '4.113'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 3283/5680 [8:26:52<5:19:51,  8.01s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3284/5680 [8:27:00<5:19:17,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6307', 'grad_norm': '0.3661', 'learning_rate': '7.574e-05', 'ppl': '1.879', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 26902528, 'tokens/trainable': 26599496, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3284/5680 [8:27:00<5:19:17,  8.00s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3285/5680 [8:27:08<5:18:28,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4133', 'grad_norm': '0.3131', 'learning_rate': '7.569e-05', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.3', 'tokens/total': 26910720, 'tokens/trainable': 26607330, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3285/5680 [8:27:08<5:18:28,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3286/5680 [8:27:16<5:18:16,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5156', 'grad_norm': '0.3823', 'learning_rate': '7.563e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.5', 'tokens/total': 26918912, 'tokens/trainable': 26615130, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3286/5680 [8:27:16<5:18:16,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3287/5680 [8:27:24<5:18:26,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3798', 'grad_norm': '0.2853', 'learning_rate': '7.558e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.4', 'tokens/total': 26927104, 'tokens/trainable': 26622996, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 3287/5680 [8:27:24<5:18:26,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3288/5680 [8:27:32<5:18:04,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4042', 'grad_norm': '0.3466', 'learning_rate': '7.553e-05', 'ppl': '1.498', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 26935296, 'tokens/trainable': 26631104, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3288/5680 [8:27:32<5:18:04,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3289/5680 [8:27:40<5:17:56,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.659', 'grad_norm': '0.4488', 'learning_rate': '7.547e-05', 'ppl': '1.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973.6', 'tokens/total': 26943488, 'tokens/trainable': 26638870, 'epoch': '4.114'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3289/5680 [8:27:40<5:17:56,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3290/5680 [8:27:48<5:17:55,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3887', 'grad_norm': '0.3701', 'learning_rate': '7.542e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 26951680, 'tokens/trainable': 26646936, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3290/5680 [8:27:48<5:17:55,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3291/5680 [8:27:56<5:17:53,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3819', 'grad_norm': '0.3098', 'learning_rate': '7.537e-05', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.5', 'tokens/total': 26959872, 'tokens/trainable': 26654856, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3291/5680 [8:27:56<5:17:53,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3292/5680 [8:28:04<5:17:36,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7064', 'grad_norm': '0.4707', 'learning_rate': '7.531e-05', 'ppl': '2.027', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '953.2', 'tokens/total': 26968064, 'tokens/trainable': 26662452, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3292/5680 [8:28:04<5:17:36,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3293/5680 [8:28:12<5:17:47,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4137', 'grad_norm': '0.2918', 'learning_rate': '7.526e-05', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 26976256, 'tokens/trainable': 26670516, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3293/5680 [8:28:12<5:17:47,  7.99s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3294/5680 [8:28:20<5:17:16,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7087', 'grad_norm': '0.3892', 'learning_rate': '7.521e-05', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 26984448, 'tokens/trainable': 26678500, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 3294/5680 [8:28:20<5:17:16,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3295/5680 [8:28:28<5:17:07,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.657', 'grad_norm': '0.356', 'learning_rate': '7.515e-05', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 26992640, 'tokens/trainable': 26686628, 'epoch': '4.115'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3295/5680 [8:28:28<5:17:07,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3296/5680 [8:28:36<5:17:42,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.464', 'grad_norm': '0.4249', 'learning_rate': '7.51e-05', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '925.7', 'tokens/total': 27000832, 'tokens/trainable': 26694060, 'epoch': '4.116'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3296/5680 [8:28:36<5:17:42,  8.00s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3297/5680 [8:28:44<5:16:54,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4416', 'grad_norm': '0.3038', 'learning_rate': '7.505e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27009024, 'tokens/trainable': 26702224, 'epoch': '4.116'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3297/5680 [8:28:44<5:16:54,  7.98s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3298/5680 [8:28:52<5:17:06,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8455', 'grad_norm': '0.5092', 'learning_rate': '7.499e-05', 'ppl': '2.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '935', 'tokens/total': 27017216, 'tokens/trainable': 26709708, 'epoch': '4.116'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 3298/5680 [8:28:52<5:17:06,  7.99s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 3299/5680 [8:29:00<5:17:42,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3088', 'grad_norm': '0.3789', 'learning_rate': '7.494e-05', 'ppl': '1.362', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '757.2', 'tokens/total': 27025408, 'tokens/trainable': 26715800, 'epoch': '4.116'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 3299/5680 [8:29:00<5:17:42,  8.01s/it][2026-01-27 06:18:14,052] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:60938] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 06:18:15,390] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:60938] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<04:13, 21.94 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:05<01:56, 46.78 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:05<01:10, 75.68 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:47, 109.55 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:06<00:36, 142.77 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:06<00:28, 174.45 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:07<00:24, 203.13 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:07<00:22, 215.71 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:07<00:20, 229.78 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:08<00:18, 243.87 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:08<00:17, 264.24 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:08<00:16, 268.57 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:09<00:15, 272.97 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:09<00:15, 279.06 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:10<00:15, 262.93 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:10<00:13, 302.03 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:10<00:11, 336.56 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:10<00:12, 295.55 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:11<00:12, 304.30 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:11<00:10, 356.45 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:11<00:11, 293.03 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:12<00:11, 297.92 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:12<00:10, 306.79 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:13<00:11, 278.48 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:13<00:10, 303.82 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:13<00:08, 351.90 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:14<00:09, 307.41 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:14<00:09, 294.73 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:14<00:08, 318.43 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:14<00:07, 320.10 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:15<00:07, 305.89 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:15<00:06, 359.70 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:15<00:06, 336.07 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:16<00:07, 290.93 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:16<00:06, 294.22 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:16<00:05, 334.16 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:17<00:06, 286.18 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:17<00:05, 296.90 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:18<00:05, 289.01 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:18<00:04, 309.23 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:18<00:04, 311.49 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:19<00:04, 305.17 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:19<00:03, 346.51 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:19<00:03, 291.52 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:20<00:03, 298.74 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:20<00:02, 307.87 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:20<00:02, 312.77 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:21<00:02, 309.19 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:21<00:01, 329.16 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:21<00:01, 308.98 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:22<00:00, 316.69 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:22<00:00, 325.96 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:22<00:00, 338.24 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:23<00:00, 325.96 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:23<00:00, 240.49 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 861.30 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:03, 1145.62 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:02, 1200.40 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:03<00:01, 1261.95 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:04<00:00, 1301.04 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1405.68 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1271.84 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1186.43 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1807.68 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2192.72 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2526.76 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2684.59 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2276.31 examples/s]
[2026-01-27 06:18:53,291] [WARNING] [py.warnings._showwarnmsg:109] [PID:60938] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3300/5680 [8:29:47<13:07:26, 19.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3737', 'grad_norm': '0.5426', 'learning_rate': '7.488e-05', 'ppl': '1.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '113.5', 'tokens/total': 27033600, 'tokens/trainable': 26721160, 'epoch': '4.116'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3300/5680 [8:29:47<13:07:26, 19.85s/it][2026-01-27 06:19:01,739] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:61177] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 06:19:03,580] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:61177] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:08<07:12, 12.87 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:08<03:11, 28.52 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:09<01:55, 46.35 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:09<01:20, 65.09 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▌                                                                                                                                             | 530/5677 [00:10<00:58, 87.38 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:10<00:44, 112.52 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:11<00:40, 123.26 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:11<00:32, 150.34 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:12<00:29, 159.94 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:13<00:28, 163.17 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:13<00:25, 173.96 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:14<00:24, 178.43 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:14<00:22, 193.12 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:15<00:21, 195.94 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:15<00:20, 196.07 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:16<00:19, 199.75 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:16<00:18, 212.26 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:17<00:18, 203.30 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:17<00:18, 203.89 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:18<00:18, 188.40 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:18<00:17, 199.81 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:19<00:16, 203.87 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:19<00:16, 203.22 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:20<00:15, 200.53 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:20<00:16, 189.40 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:21<00:13, 211.73 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:21<00:13, 204.25 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:22<00:13, 207.24 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:23<00:13, 193.66 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:23<00:12, 194.45 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:24<00:11, 202.22 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:24<00:10, 216.92 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:25<00:11, 196.97 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:25<00:09, 212.77 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:26<00:09, 201.10 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:26<00:08, 218.08 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:27<00:09, 197.32 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:27<00:08, 200.85 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:28<00:07, 203.80 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:28<00:07, 195.08 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:29<00:07, 194.23 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:29<00:06, 199.98 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:30<00:05, 200.35 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:30<00:05, 198.31 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:31<00:05, 184.27 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:31<00:03, 211.96 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:32<00:03, 207.54 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:32<00:03, 191.07 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:33<00:02, 205.68 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:33<00:02, 194.69 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:34<00:01, 211.69 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:34<00:01, 203.23 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:35<00:00, 206.95 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:35<00:00, 208.89 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:36<00:00, 153.46 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 888.22 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:03, 1109.45 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:02, 1209.16 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:03<00:01, 1264.38 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:04<00:00, 1284.76 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1330.95 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1240.91 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1102.61 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1635.19 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:01, 1934.84 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:02<00:00, 2197.33 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2301.16 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 1985.14 examples/s]
[2026-01-27 06:19:48,365] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:61177] Using single process for pack_parallel, running sequentially.
[2026-01-27 06:19:54,747] [WARNING] [py.warnings._showwarnmsg:109] [PID:61177] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3301/5680 [8:30:49<21:22:03, 32.33s/it]                                                                                                                                                                                                                                             {'loss': '0.3021', 'grad_norm': '0.2924', 'learning_rate': '7.483e-05', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 27041792, 'tokens/trainable': 26729296, 'epoch': '5'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3301/5680 [8:30:49<21:22:03, 32.33s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3302/5680 [8:30:57<16:31:52, 25.03s/it]                                                                                                                                                                                                                                             {'loss': '0.7089', 'grad_norm': '0.3394', 'learning_rate': '7.478e-05', 'ppl': '2.032', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27049984, 'tokens/trainable': 26737486, 'epoch': '5'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3302/5680 [8:30:57<16:31:52, 25.03s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3303/5680 [8:31:05<13:08:34, 19.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3985', 'grad_norm': '0.3986', 'learning_rate': '7.472e-05', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 27058176, 'tokens/trainable': 26745652, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3303/5680 [8:31:05<13:08:34, 19.91s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3304/5680 [8:31:13<10:51:16, 16.45s/it]                                                                                                                                                                                                                                             {'loss': '0.4361', 'grad_norm': '0.3518', 'learning_rate': '7.467e-05', 'ppl': '1.547', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973.3', 'tokens/total': 27066368, 'tokens/trainable': 26753802, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3304/5680 [8:31:13<10:51:16, 16.45s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 3305/5680 [8:31:21<9:10:00, 13.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4944', 'grad_norm': '0.3588', 'learning_rate': '7.462e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27074560, 'tokens/trainable': 26761964, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 3305/5680 [8:31:21<9:10:00, 13.90s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3306/5680 [8:31:29<7:59:18, 12.11s/it]                                                                                                                                                                                                                                             {'loss': '0.6137', 'grad_norm': '0.3465', 'learning_rate': '7.456e-05', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27082752, 'tokens/trainable': 26770122, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3306/5680 [8:31:29<7:59:18, 12.11s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3307/5680 [8:31:37<7:10:18, 10.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6767', 'grad_norm': '0.375', 'learning_rate': '7.451e-05', 'ppl': '1.967', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 27090944, 'tokens/trainable': 26778284, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3307/5680 [8:31:37<7:10:18, 10.88s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3308/5680 [8:31:45<6:39:13, 10.10s/it]                                                                                                                                                                                                                                             {'loss': '0.5975', 'grad_norm': '0.3739', 'learning_rate': '7.446e-05', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.6', 'tokens/total': 27099136, 'tokens/trainable': 26786468, 'epoch': '5.001'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3308/5680 [8:31:45<6:39:13, 10.10s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3309/5680 [8:31:53<6:14:20,  9.47s/it]                                                                                                                                                                                                                                             {'loss': '0.6092', 'grad_norm': '0.3648', 'learning_rate': '7.44e-05', 'ppl': '1.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27107328, 'tokens/trainable': 26794624, 'epoch': '5.002'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 3309/5680 [8:31:53<6:14:20,  9.47s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3310/5680 [8:32:01<5:56:47,  9.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4813', 'grad_norm': '0.3841', 'learning_rate': '7.435e-05', 'ppl': '1.618', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27115520, 'tokens/trainable': 26802808, 'epoch': '5.002'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3310/5680 [8:32:01<5:56:47,  9.03s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3311/5680 [8:32:09<5:44:12,  8.72s/it]                                                                                                                                                                                                                                             {'loss': '0.4279', 'grad_norm': '0.3302', 'learning_rate': '7.43e-05', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27123712, 'tokens/trainable': 26810992, 'epoch': '5.002'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3311/5680 [8:32:09<5:44:12,  8.72s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3312/5680 [8:32:17<5:35:56,  8.51s/it]                                                                                                                                                                                                                                             {'loss': '0.3746', 'grad_norm': '0.337', 'learning_rate': '7.424e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 27131904, 'tokens/trainable': 26819124, 'epoch': '5.002'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3312/5680 [8:32:17<5:35:56,  8.51s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3313/5680 [8:32:25<5:30:38,  8.38s/it]                                                                                                                                                                                                                                             {'loss': '0.276', 'grad_norm': '0.379', 'learning_rate': '7.419e-05', 'ppl': '1.318', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 27140096, 'tokens/trainable': 26827268, 'epoch': '5.002'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 3313/5680 [8:32:25<5:30:38,  8.38s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3314/5680 [8:32:33<5:26:47,  8.29s/it]                                                                                                                                                                                                                                             {'loss': '0.2656', 'grad_norm': '0.342', 'learning_rate': '7.414e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 27148288, 'tokens/trainable': 26835420, 'epoch': '5.002'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3314/5680 [8:32:33<5:26:47,  8.29s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3315/5680 [8:32:41<5:23:20,  8.20s/it]                                                                                                                                                                                                                                             {'loss': '0.59', 'grad_norm': '0.4237', 'learning_rate': '7.408e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27156480, 'tokens/trainable': 26843592, 'epoch': '5.003'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3315/5680 [8:32:41<5:23:20,  8.20s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3316/5680 [8:32:49<5:20:31,  8.14s/it]                                                                                                                                                                                                                                             {'loss': '0.5151', 'grad_norm': '0.3751', 'learning_rate': '7.403e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27164672, 'tokens/trainable': 26851768, 'epoch': '5.003'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3316/5680 [8:32:49<5:20:31,  8.14s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3317/5680 [8:32:57<5:19:00,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.4918', 'grad_norm': '0.3336', 'learning_rate': '7.398e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27172864, 'tokens/trainable': 26859956, 'epoch': '5.003'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                | 3317/5680 [8:32:57<5:19:00,  8.10s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3318/5680 [8:33:06<5:18:06,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.7331', 'grad_norm': '0.4236', 'learning_rate': '7.392e-05', 'ppl': '2.082', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 27181056, 'tokens/trainable': 26868088, 'epoch': '5.003'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3318/5680 [8:33:06<5:18:06,  8.08s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3319/5680 [8:33:14<5:17:35,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.3698', 'grad_norm': '0.3343', 'learning_rate': '7.387e-05', 'ppl': '1.447', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 27189248, 'tokens/trainable': 26876220, 'epoch': '5.003'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3319/5680 [8:33:14<5:17:35,  8.07s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3320/5680 [8:33:22<5:16:27,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.4568', 'grad_norm': '0.3963', 'learning_rate': '7.382e-05', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27197440, 'tokens/trainable': 26884382, 'epoch': '5.004'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 3320/5680 [8:33:22<5:16:27,  8.05s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3321/5680 [8:33:30<5:15:45,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.344', 'grad_norm': '0.3748', 'learning_rate': '7.376e-05', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 27205632, 'tokens/trainable': 26892536, 'epoch': '5.004'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3321/5680 [8:33:30<5:15:45,  8.03s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3322/5680 [8:33:38<5:15:15,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6807', 'grad_norm': '0.39', 'learning_rate': '7.371e-05', 'ppl': '1.975', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27213824, 'tokens/trainable': 26900700, 'epoch': '5.004'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3322/5680 [8:33:38<5:15:15,  8.02s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3323/5680 [8:33:46<5:14:52,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5996', 'grad_norm': '0.4057', 'learning_rate': '7.366e-05', 'ppl': '1.821', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27222016, 'tokens/trainable': 26908844, 'epoch': '5.004'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3323/5680 [8:33:46<5:14:52,  8.02s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3324/5680 [8:33:53<5:13:57,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6923', 'grad_norm': '0.3686', 'learning_rate': '7.36e-05', 'ppl': '1.998', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27230208, 'tokens/trainable': 26916956, 'epoch': '5.004'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 3324/5680 [8:33:53<5:13:57,  8.00s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3325/5680 [8:34:02<5:14:14,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3875', 'grad_norm': '0.3369', 'learning_rate': '7.355e-05', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 27238400, 'tokens/trainable': 26925096, 'epoch': '5.004'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3325/5680 [8:34:02<5:14:14,  8.01s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3326/5680 [8:34:10<5:14:03,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3716', 'grad_norm': '0.3173', 'learning_rate': '7.35e-05', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27246592, 'tokens/trainable': 26933248, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3326/5680 [8:34:10<5:14:03,  8.00s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3327/5680 [8:34:18<5:13:54,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3557', 'grad_norm': '0.352', 'learning_rate': '7.344e-05', 'ppl': '1.427', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27254784, 'tokens/trainable': 26941440, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3327/5680 [8:34:18<5:13:54,  8.00s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3328/5680 [8:34:26<5:14:05,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5656', 'grad_norm': '0.3493', 'learning_rate': '7.339e-05', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 27262976, 'tokens/trainable': 26949628, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 3328/5680 [8:34:26<5:14:05,  8.01s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3329/5680 [8:34:34<5:13:46,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6962', 'grad_norm': '0.4393', 'learning_rate': '7.334e-05', 'ppl': '2.006', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27271168, 'tokens/trainable': 26957800, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3329/5680 [8:34:34<5:13:46,  8.01s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3330/5680 [8:34:42<5:14:15,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.3494', 'grad_norm': '0.3555', 'learning_rate': '7.328e-05', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 27279360, 'tokens/trainable': 26965952, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3330/5680 [8:34:42<5:14:15,  8.02s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3331/5680 [8:34:50<5:14:24,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4982', 'grad_norm': '0.3681', 'learning_rate': '7.323e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 27287552, 'tokens/trainable': 26974074, 'epoch': '5.005'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 3331/5680 [8:34:50<5:14:24,  8.03s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3332/5680 [8:34:58<5:14:07,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.6332', 'grad_norm': '0.3575', 'learning_rate': '7.318e-05', 'ppl': '1.884', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 27295744, 'tokens/trainable': 26982228, 'epoch': '5.006'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3332/5680 [8:34:58<5:14:07,  8.03s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3333/5680 [8:35:06<5:14:02,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.7612', 'grad_norm': '0.3734', 'learning_rate': '7.312e-05', 'ppl': '2.141', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 27303936, 'tokens/trainable': 26990360, 'epoch': '5.006'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3333/5680 [8:35:06<5:14:02,  8.03s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3334/5680 [8:35:14<5:13:26,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6096', 'grad_norm': '0.4134', 'learning_rate': '7.307e-05', 'ppl': '1.84', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27312128, 'tokens/trainable': 26998526, 'epoch': '5.006'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3334/5680 [8:35:14<5:13:26,  8.02s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3335/5680 [8:35:22<5:13:29,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5307', 'grad_norm': '0.3802', 'learning_rate': '7.302e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27320320, 'tokens/trainable': 27006704, 'epoch': '5.006'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3335/5680 [8:35:22<5:13:29,  8.02s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3336/5680 [8:35:30<5:12:59,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7115', 'grad_norm': '0.4242', 'learning_rate': '7.296e-05', 'ppl': '2.037', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27328512, 'tokens/trainable': 27014842, 'epoch': '5.006'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3336/5680 [8:35:30<5:12:59,  8.01s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3337/5680 [8:35:38<5:12:42,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6963', 'grad_norm': '0.4133', 'learning_rate': '7.291e-05', 'ppl': '2.006', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 27336704, 'tokens/trainable': 27023000, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3337/5680 [8:35:38<5:12:42,  8.01s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3338/5680 [8:35:46<5:12:08,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5995', 'grad_norm': '0.3937', 'learning_rate': '7.286e-05', 'ppl': '1.821', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27344896, 'tokens/trainable': 27031158, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3338/5680 [8:35:46<5:12:08,  8.00s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3339/5680 [8:35:54<5:11:55,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4793', 'grad_norm': '0.3392', 'learning_rate': '7.28e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27353088, 'tokens/trainable': 27039336, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 3339/5680 [8:35:54<5:11:55,  7.99s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3340/5680 [8:36:02<5:11:34,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3971', 'grad_norm': '0.3102', 'learning_rate': '7.275e-05', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27361280, 'tokens/trainable': 27047464, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3340/5680 [8:36:02<5:11:34,  7.99s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3341/5680 [8:36:10<5:11:59,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4502', 'grad_norm': '0.3589', 'learning_rate': '7.27e-05', 'ppl': '1.569', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 27369472, 'tokens/trainable': 27055588, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3341/5680 [8:36:10<5:11:59,  8.00s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3342/5680 [8:36:18<5:11:31,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7675', 'grad_norm': '0.5418', 'learning_rate': '7.264e-05', 'ppl': '2.154', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27377664, 'tokens/trainable': 27063744, 'epoch': '5.007'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 3342/5680 [8:36:18<5:11:31,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3343/5680 [8:36:26<5:11:19,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4126', 'grad_norm': '0.3831', 'learning_rate': '7.259e-05', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27385856, 'tokens/trainable': 27071928, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3343/5680 [8:36:26<5:11:19,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3344/5680 [8:36:34<5:11:06,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4449', 'grad_norm': '0.3288', 'learning_rate': '7.254e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27394048, 'tokens/trainable': 27080104, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3344/5680 [8:36:34<5:11:06,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3345/5680 [8:36:42<5:10:59,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5327', 'grad_norm': '0.4353', 'learning_rate': '7.248e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27402240, 'tokens/trainable': 27088236, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3345/5680 [8:36:42<5:10:59,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3346/5680 [8:36:50<5:10:51,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4238', 'grad_norm': '0.3431', 'learning_rate': '7.243e-05', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27410432, 'tokens/trainable': 27096424, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 3346/5680 [8:36:50<5:10:51,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3347/5680 [8:36:58<5:10:39,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5107', 'grad_norm': '0.4185', 'learning_rate': '7.238e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27418624, 'tokens/trainable': 27104594, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3347/5680 [8:36:58<5:10:39,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3348/5680 [8:37:06<5:10:11,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6069', 'grad_norm': '0.4041', 'learning_rate': '7.232e-05', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27426816, 'tokens/trainable': 27112776, 'epoch': '5.008'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3348/5680 [8:37:06<5:10:11,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3349/5680 [8:37:14<5:10:02,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3724', 'grad_norm': '0.2953', 'learning_rate': '7.227e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27435008, 'tokens/trainable': 27120924, 'epoch': '5.009'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3349/5680 [8:37:14<5:10:02,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3350/5680 [8:37:22<5:09:53,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6706', 'grad_norm': '0.3918', 'learning_rate': '7.222e-05', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27443200, 'tokens/trainable': 27129096, 'epoch': '5.009'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 3350/5680 [8:37:22<5:09:53,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3351/5680 [8:37:30<5:13:20,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.4306', 'grad_norm': '0.4031', 'learning_rate': '7.216e-05', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.5', 'tokens/total': 27451392, 'tokens/trainable': 27137278, 'epoch': '5.009'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3351/5680 [8:37:30<5:13:20,  8.07s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3352/5680 [8:37:38<5:12:21,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.6998', 'grad_norm': '0.3502', 'learning_rate': '7.211e-05', 'ppl': '2.013', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27459584, 'tokens/trainable': 27145456, 'epoch': '5.009'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3352/5680 [8:37:38<5:12:21,  8.05s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3353/5680 [8:37:46<5:12:40,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.8274', 'grad_norm': '0.4269', 'learning_rate': '7.206e-05', 'ppl': '2.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 27467776, 'tokens/trainable': 27153608, 'epoch': '5.009'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3353/5680 [8:37:46<5:12:40,  8.06s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3354/5680 [8:37:54<5:12:36,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7298', 'grad_norm': '0.4004', 'learning_rate': '7.201e-05', 'ppl': '2.075', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 27475968, 'tokens/trainable': 27161766, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 3354/5680 [8:37:54<5:12:36,  8.06s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3355/5680 [8:38:02<5:12:59,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.4625', 'grad_norm': '0.3954', 'learning_rate': '7.195e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 27484160, 'tokens/trainable': 27169908, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3355/5680 [8:38:02<5:12:59,  8.08s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3356/5680 [8:38:10<5:12:27,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.5282', 'grad_norm': '0.3184', 'learning_rate': '7.19e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 27492352, 'tokens/trainable': 27178068, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3356/5680 [8:38:10<5:12:27,  8.07s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3357/5680 [8:38:18<5:11:48,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.4987', 'grad_norm': '0.412', 'learning_rate': '7.185e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27500544, 'tokens/trainable': 27186244, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 3357/5680 [8:38:18<5:11:48,  8.05s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3358/5680 [8:38:26<5:11:04,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7087', 'grad_norm': '0.4464', 'learning_rate': '7.179e-05', 'ppl': '2.031', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27508736, 'tokens/trainable': 27194398, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3358/5680 [8:38:26<5:11:04,  8.04s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3359/5680 [8:38:34<5:10:25,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5762', 'grad_norm': '0.4486', 'learning_rate': '7.174e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27516928, 'tokens/trainable': 27202536, 'epoch': '5.01'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3359/5680 [8:38:34<5:10:25,  8.02s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3360/5680 [8:38:42<5:08:57,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5898', 'grad_norm': '0.3764', 'learning_rate': '7.169e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27525120, 'tokens/trainable': 27210636, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3360/5680 [8:38:42<5:08:57,  7.99s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3361/5680 [8:38:50<5:08:14,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4158', 'grad_norm': '0.3984', 'learning_rate': '7.163e-05', 'ppl': '1.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27533312, 'tokens/trainable': 27218798, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 3361/5680 [8:38:50<5:08:14,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3362/5680 [8:38:58<5:07:38,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4691', 'grad_norm': '0.3875', 'learning_rate': '7.158e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27541504, 'tokens/trainable': 27226922, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3362/5680 [8:38:58<5:07:38,  7.96s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3363/5680 [8:39:06<5:07:47,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4712', 'grad_norm': '0.3689', 'learning_rate': '7.153e-05', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27549696, 'tokens/trainable': 27235104, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3363/5680 [8:39:06<5:07:47,  7.97s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3364/5680 [8:39:14<5:07:58,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7725', 'grad_norm': '0.4156', 'learning_rate': '7.147e-05', 'ppl': '2.165', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27557888, 'tokens/trainable': 27243276, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3364/5680 [8:39:14<5:07:58,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3365/5680 [8:39:22<5:08:05,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3762', 'grad_norm': '0.3623', 'learning_rate': '7.142e-05', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 27566080, 'tokens/trainable': 27251420, 'epoch': '5.011'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 3365/5680 [8:39:22<5:08:05,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3366/5680 [8:39:30<5:07:42,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4272', 'grad_norm': '0.4807', 'learning_rate': '7.137e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27574272, 'tokens/trainable': 27259576, 'epoch': '5.012'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3366/5680 [8:39:30<5:07:42,  7.98s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3367/5680 [8:39:38<5:07:09,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5376', 'grad_norm': '0.3697', 'learning_rate': '7.132e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 27582464, 'tokens/trainable': 27267752, 'epoch': '5.012'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3367/5680 [8:39:38<5:07:09,  7.97s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3368/5680 [8:39:46<5:06:55,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6474', 'grad_norm': '0.3968', 'learning_rate': '7.126e-05', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27590656, 'tokens/trainable': 27275888, 'epoch': '5.012'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 3368/5680 [8:39:46<5:06:55,  7.97s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3369/5680 [8:39:54<5:06:03,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5804', 'grad_norm': '0.346', 'learning_rate': '7.121e-05', 'ppl': '1.787', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 27598848, 'tokens/trainable': 27284008, 'epoch': '5.012'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3369/5680 [8:39:54<5:06:03,  7.95s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3370/5680 [8:40:02<5:06:28,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3278', 'grad_norm': '0.343', 'learning_rate': '7.116e-05', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 27607040, 'tokens/trainable': 27292098, 'epoch': '5.012'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3370/5680 [8:40:02<5:06:28,  7.96s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3371/5680 [8:40:10<5:05:37,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7106', 'grad_norm': '0.3496', 'learning_rate': '7.11e-05', 'ppl': '2.035', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 27615232, 'tokens/trainable': 27300268, 'epoch': '5.013'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3371/5680 [8:40:10<5:05:37,  7.94s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3372/5680 [8:40:18<5:06:15,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5262', 'grad_norm': '0.4072', 'learning_rate': '7.105e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27623424, 'tokens/trainable': 27308448, 'epoch': '5.013'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 3372/5680 [8:40:18<5:06:15,  7.96s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3373/5680 [8:40:26<5:06:12,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3878', 'grad_norm': '0.3082', 'learning_rate': '7.1e-05', 'ppl': '1.474', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27631616, 'tokens/trainable': 27316566, 'epoch': '5.013'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3373/5680 [8:40:26<5:06:12,  7.96s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3374/5680 [8:40:34<5:06:13,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4799', 'grad_norm': '0.4187', 'learning_rate': '7.095e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27639808, 'tokens/trainable': 27324688, 'epoch': '5.013'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3374/5680 [8:40:34<5:06:13,  7.97s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3375/5680 [8:40:41<5:05:31,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4458', 'grad_norm': '0.3585', 'learning_rate': '7.089e-05', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 27648000, 'tokens/trainable': 27332868, 'epoch': '5.013'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3375/5680 [8:40:41<5:05:31,  7.95s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3376/5680 [8:40:49<5:05:41,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6303', 'grad_norm': '0.3656', 'learning_rate': '7.084e-05', 'ppl': '1.878', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27656192, 'tokens/trainable': 27341040, 'epoch': '5.013'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 3376/5680 [8:40:49<5:05:41,  7.96s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3377/5680 [8:40:57<5:05:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3376', 'grad_norm': '0.329', 'learning_rate': '7.079e-05', 'ppl': '1.402', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 27664384, 'tokens/trainable': 27349224, 'epoch': '5.014'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3377/5680 [8:40:57<5:05:45,  7.97s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3378/5680 [8:41:05<5:04:40,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3746', 'grad_norm': '0.3249', 'learning_rate': '7.073e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27672576, 'tokens/trainable': 27357296, 'epoch': '5.014'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3378/5680 [8:41:05<5:04:40,  7.94s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3379/5680 [8:41:13<5:04:40,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2651', 'grad_norm': '0.3295', 'learning_rate': '7.068e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27680768, 'tokens/trainable': 27365420, 'epoch': '5.014'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3379/5680 [8:41:13<5:04:40,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3380/5680 [8:41:21<5:04:15,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6317', 'grad_norm': '0.3656', 'learning_rate': '7.063e-05', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 27688960, 'tokens/trainable': 27373546, 'epoch': '5.014'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3380/5680 [8:41:21<5:04:15,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3381/5680 [8:41:29<5:04:25,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5179', 'grad_norm': '0.3133', 'learning_rate': '7.057e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 27697152, 'tokens/trainable': 27381668, 'epoch': '5.014'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3381/5680 [8:41:29<5:04:25,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3382/5680 [8:41:37<5:04:13,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.333', 'grad_norm': '0.3781', 'learning_rate': '7.052e-05', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 27705344, 'tokens/trainable': 27389834, 'epoch': '5.014'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3382/5680 [8:41:37<5:04:13,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3383/5680 [8:41:45<5:03:50,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3467', 'grad_norm': '0.4059', 'learning_rate': '7.047e-05', 'ppl': '1.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 27713536, 'tokens/trainable': 27397960, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3383/5680 [8:41:45<5:03:50,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3384/5680 [8:41:53<5:03:39,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5815', 'grad_norm': '0.4807', 'learning_rate': '7.042e-05', 'ppl': '1.789', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 27721728, 'tokens/trainable': 27406096, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3384/5680 [8:41:53<5:03:39,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3385/5680 [8:42:01<5:03:32,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5298', 'grad_norm': '0.3904', 'learning_rate': '7.036e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 27729920, 'tokens/trainable': 27414266, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3385/5680 [8:42:01<5:03:32,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3386/5680 [8:42:09<5:03:32,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2845', 'grad_norm': '0.4479', 'learning_rate': '7.031e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 27738112, 'tokens/trainable': 27422424, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3386/5680 [8:42:09<5:03:32,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3387/5680 [8:42:17<5:03:25,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4551', 'grad_norm': '0.317', 'learning_rate': '7.026e-05', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 27746304, 'tokens/trainable': 27430560, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 3387/5680 [8:42:17<5:03:25,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3388/5680 [8:42:25<5:03:13,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3937', 'grad_norm': '0.307', 'learning_rate': '7.021e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 27754496, 'tokens/trainable': 27438740, 'epoch': '5.015'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3388/5680 [8:42:25<5:03:13,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3389/5680 [8:42:33<5:02:28,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5732', 'grad_norm': '0.3624', 'learning_rate': '7.015e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 27762688, 'tokens/trainable': 27446888, 'epoch': '5.016'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3389/5680 [8:42:33<5:02:28,  7.92s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3390/5680 [8:42:40<5:02:01,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3839', 'grad_norm': '0.3501', 'learning_rate': '7.01e-05', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 27770880, 'tokens/trainable': 27455022, 'epoch': '5.016'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 3390/5680 [8:42:40<5:02:01,  7.91s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3391/5680 [8:42:48<5:02:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3911', 'grad_norm': '0.3306', 'learning_rate': '7.005e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 27779072, 'tokens/trainable': 27463122, 'epoch': '5.016'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3391/5680 [8:42:48<5:02:47,  7.94s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3392/5680 [8:42:56<5:02:29,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5521', 'grad_norm': '0.3597', 'learning_rate': '6.999e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 27787264, 'tokens/trainable': 27471288, 'epoch': '5.016'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3392/5680 [8:42:56<5:02:29,  7.93s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3393/5680 [8:43:04<5:02:50,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3633', 'grad_norm': '0.3661', 'learning_rate': '6.994e-05', 'ppl': '1.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 27795456, 'tokens/trainable': 27479394, 'epoch': '5.016'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3393/5680 [8:43:04<5:02:50,  7.95s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3394/5680 [8:43:13<5:06:22,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7751', 'grad_norm': '0.3915', 'learning_rate': '6.989e-05', 'ppl': '2.171', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.6', 'tokens/total': 27803648, 'tokens/trainable': 27487498, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                             | 3394/5680 [8:43:13<5:06:22,  8.04s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3395/5680 [8:43:21<5:05:26,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.6595', 'grad_norm': '0.3483', 'learning_rate': '6.984e-05', 'ppl': '1.934', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27811840, 'tokens/trainable': 27495644, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3395/5680 [8:43:21<5:05:26,  8.02s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3396/5680 [8:43:29<5:05:22,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.3162', 'grad_norm': '0.3491', 'learning_rate': '6.978e-05', 'ppl': '1.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 27820032, 'tokens/trainable': 27503730, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3396/5680 [8:43:29<5:05:22,  8.02s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3397/5680 [8:43:37<5:04:09,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7518', 'grad_norm': '0.365', 'learning_rate': '6.973e-05', 'ppl': '2.121', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27828224, 'tokens/trainable': 27511824, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3397/5680 [8:43:37<5:04:09,  7.99s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3398/5680 [8:43:45<5:03:42,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7829', 'grad_norm': '0.5555', 'learning_rate': '6.968e-05', 'ppl': '2.188', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27836416, 'tokens/trainable': 27519976, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3398/5680 [8:43:45<5:03:42,  7.99s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3399/5680 [8:43:52<5:03:02,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3949', 'grad_norm': '0.3676', 'learning_rate': '6.962e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27844608, 'tokens/trainable': 27528096, 'epoch': '5.017'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3399/5680 [8:43:52<5:03:02,  7.97s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3400/5680 [8:44:00<5:01:57,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6297', 'grad_norm': '0.3594', 'learning_rate': '6.957e-05', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 27852800, 'tokens/trainable': 27536248, 'epoch': '5.018'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3400/5680 [8:44:00<5:01:57,  7.95s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3401/5680 [8:44:08<5:02:35,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4582', 'grad_norm': '0.4667', 'learning_rate': '6.952e-05', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 27860992, 'tokens/trainable': 27544376, 'epoch': '5.018'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3401/5680 [8:44:08<5:02:35,  7.97s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3402/5680 [8:44:16<5:01:52,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4079', 'grad_norm': '0.3187', 'learning_rate': '6.947e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 27869184, 'tokens/trainable': 27552532, 'epoch': '5.018'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 3402/5680 [8:44:16<5:01:52,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3403/5680 [8:44:24<5:01:44,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4536', 'grad_norm': '0.3506', 'learning_rate': '6.941e-05', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 27877376, 'tokens/trainable': 27560628, 'epoch': '5.018'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3403/5680 [8:44:24<5:01:44,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3404/5680 [8:44:32<5:01:31,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5445', 'grad_norm': '0.3957', 'learning_rate': '6.936e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 27885568, 'tokens/trainable': 27568776, 'epoch': '5.018'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3404/5680 [8:44:32<5:01:31,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3405/5680 [8:44:40<5:01:22,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4549', 'grad_norm': '0.3999', 'learning_rate': '6.931e-05', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 27893760, 'tokens/trainable': 27576910, 'epoch': '5.018'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 3405/5680 [8:44:40<5:01:22,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3406/5680 [8:44:48<5:01:08,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4332', 'grad_norm': '0.3523', 'learning_rate': '6.926e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 27901952, 'tokens/trainable': 27585080, 'epoch': '5.019'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3406/5680 [8:44:48<5:01:08,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3407/5680 [8:44:56<5:01:17,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.325', 'grad_norm': '0.3395', 'learning_rate': '6.92e-05', 'ppl': '1.384', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 27910144, 'tokens/trainable': 27593212, 'epoch': '5.019'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3407/5680 [8:44:56<5:01:17,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3408/5680 [8:45:04<5:00:38,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3741', 'grad_norm': '0.3627', 'learning_rate': '6.915e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 27918336, 'tokens/trainable': 27601384, 'epoch': '5.019'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3408/5680 [8:45:04<5:00:38,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3409/5680 [8:45:12<5:00:13,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6421', 'grad_norm': '0.4491', 'learning_rate': '6.91e-05', 'ppl': '1.9', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 27926528, 'tokens/trainable': 27609562, 'epoch': '5.019'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 3409/5680 [8:45:12<5:00:13,  7.93s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3410/5680 [8:45:20<5:00:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5647', 'grad_norm': '0.4098', 'learning_rate': '6.905e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 27934720, 'tokens/trainable': 27617738, 'epoch': '5.019'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3410/5680 [8:45:20<5:00:17,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3411/5680 [8:45:28<5:00:13,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3961', 'grad_norm': '0.3348', 'learning_rate': '6.899e-05', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 27942912, 'tokens/trainable': 27625916, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3411/5680 [8:45:28<5:00:13,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3412/5680 [8:45:36<5:00:27,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4887', 'grad_norm': '0.4092', 'learning_rate': '6.894e-05', 'ppl': '1.63', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 27951104, 'tokens/trainable': 27634072, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3412/5680 [8:45:36<5:00:27,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3413/5680 [8:45:44<5:00:05,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6437', 'grad_norm': '0.3404', 'learning_rate': '6.889e-05', 'ppl': '1.904', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 27959296, 'tokens/trainable': 27642168, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 3413/5680 [8:45:44<5:00:05,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3414/5680 [8:45:52<5:00:42,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5546', 'grad_norm': '0.4112', 'learning_rate': '6.884e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 27967488, 'tokens/trainable': 27650312, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3414/5680 [8:45:52<5:00:42,  7.96s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3415/5680 [8:46:00<4:59:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5534', 'grad_norm': '0.3321', 'learning_rate': '6.878e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 27975680, 'tokens/trainable': 27658486, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3415/5680 [8:46:00<4:59:55,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3416/5680 [8:46:07<4:59:36,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4434', 'grad_norm': '0.3803', 'learning_rate': '6.873e-05', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 27983872, 'tokens/trainable': 27666616, 'epoch': '5.02'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 3416/5680 [8:46:07<4:59:36,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3417/5680 [8:46:15<4:59:34,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2611', 'grad_norm': '0.3272', 'learning_rate': '6.868e-05', 'ppl': '1.298', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 27992064, 'tokens/trainable': 27674776, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3417/5680 [8:46:15<4:59:34,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3418/5680 [8:46:23<5:00:02,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4086', 'grad_norm': '0.3744', 'learning_rate': '6.863e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28000256, 'tokens/trainable': 27682916, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3418/5680 [8:46:23<5:00:02,  7.96s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3419/5680 [8:46:31<5:00:17,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6171', 'grad_norm': '0.4568', 'learning_rate': '6.857e-05', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28008448, 'tokens/trainable': 27691052, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3419/5680 [8:46:31<5:00:17,  7.97s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3420/5680 [8:46:39<4:59:41,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4961', 'grad_norm': '0.3854', 'learning_rate': '6.852e-05', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28016640, 'tokens/trainable': 27699242, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3420/5680 [8:46:39<4:59:41,  7.96s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3421/5680 [8:46:47<4:59:44,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4282', 'grad_norm': '0.3823', 'learning_rate': '6.847e-05', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 28024832, 'tokens/trainable': 27707376, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3421/5680 [8:46:47<4:59:44,  7.96s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3422/5680 [8:46:56<5:03:20,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.2703', 'grad_norm': '0.3309', 'learning_rate': '6.842e-05', 'ppl': '1.31', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.4', 'tokens/total': 28033024, 'tokens/trainable': 27715536, 'epoch': '5.021'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3422/5680 [8:46:56<5:03:20,  8.06s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3423/5680 [8:47:04<5:01:48,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5214', 'grad_norm': '0.4406', 'learning_rate': '6.836e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28041216, 'tokens/trainable': 27723696, 'epoch': '5.022'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3423/5680 [8:47:04<5:01:48,  8.02s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3424/5680 [8:47:11<4:59:57,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5724', 'grad_norm': '0.3753', 'learning_rate': '6.831e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 28049408, 'tokens/trainable': 27731850, 'epoch': '5.022'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 3424/5680 [8:47:11<4:59:57,  7.98s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3425/5680 [8:47:19<4:59:06,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6772', 'grad_norm': '0.3831', 'learning_rate': '6.826e-05', 'ppl': '1.968', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28057600, 'tokens/trainable': 27740010, 'epoch': '5.022'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3425/5680 [8:47:19<4:59:06,  7.96s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3426/5680 [8:47:27<4:58:32,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7264', 'grad_norm': '0.5139', 'learning_rate': '6.821e-05', 'ppl': '2.068', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28065792, 'tokens/trainable': 27748192, 'epoch': '5.022'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3426/5680 [8:47:27<4:58:32,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3427/5680 [8:47:35<4:58:20,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4469', 'grad_norm': '0.3343', 'learning_rate': '6.815e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28073984, 'tokens/trainable': 27756376, 'epoch': '5.022'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 3427/5680 [8:47:35<4:58:20,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3428/5680 [8:47:43<4:58:12,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7245', 'grad_norm': '0.4062', 'learning_rate': '6.81e-05', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28082176, 'tokens/trainable': 27764552, 'epoch': '5.023'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3428/5680 [8:47:43<4:58:12,  7.95s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3429/5680 [8:47:51<4:57:59,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3556', 'grad_norm': '0.4404', 'learning_rate': '6.805e-05', 'ppl': '1.427', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28090368, 'tokens/trainable': 27772686, 'epoch': '5.023'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3429/5680 [8:47:51<4:57:59,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3430/5680 [8:47:59<4:57:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.9284', 'grad_norm': '0.476', 'learning_rate': '6.8e-05', 'ppl': '2.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28098560, 'tokens/trainable': 27780780, 'epoch': '5.023'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3430/5680 [8:47:59<4:57:55,  7.94s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3431/5680 [8:48:07<4:57:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5504', 'grad_norm': '0.4484', 'learning_rate': '6.794e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28106752, 'tokens/trainable': 27788912, 'epoch': '5.023'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3431/5680 [8:48:07<4:57:47,  7.94s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3432/5680 [8:48:15<4:57:51,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2842', 'grad_norm': '0.3194', 'learning_rate': '6.789e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 28114944, 'tokens/trainable': 27797028, 'epoch': '5.023'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3432/5680 [8:48:15<4:57:51,  7.95s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3433/5680 [8:48:23<4:57:21,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4087', 'grad_norm': '0.3169', 'learning_rate': '6.784e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28123136, 'tokens/trainable': 27805132, 'epoch': '5.023'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3433/5680 [8:48:23<4:57:21,  7.94s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3434/5680 [8:48:31<4:57:36,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6673', 'grad_norm': '0.398', 'learning_rate': '6.779e-05', 'ppl': '1.949', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28131328, 'tokens/trainable': 27813304, 'epoch': '5.024'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3434/5680 [8:48:31<4:57:36,  7.95s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3435/5680 [8:48:39<4:57:43,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7328', 'grad_norm': '0.5499', 'learning_rate': '6.773e-05', 'ppl': '2.081', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28139520, 'tokens/trainable': 27821458, 'epoch': '5.024'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3435/5680 [8:48:39<4:57:43,  7.96s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3436/5680 [8:48:47<4:57:25,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3263', 'grad_norm': '0.406', 'learning_rate': '6.768e-05', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28147712, 'tokens/trainable': 27829640, 'epoch': '5.024'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3436/5680 [8:48:47<4:57:25,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3437/5680 [8:48:55<5:00:40,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.5959', 'grad_norm': '0.4279', 'learning_rate': '6.763e-05', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.1', 'tokens/total': 28155904, 'tokens/trainable': 27837786, 'epoch': '5.024'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3437/5680 [8:48:55<5:00:40,  8.04s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3438/5680 [8:49:03<4:59:21,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3477', 'grad_norm': '0.4289', 'learning_rate': '6.758e-05', 'ppl': '1.416', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28164096, 'tokens/trainable': 27845946, 'epoch': '5.024'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3438/5680 [8:49:03<4:59:21,  8.01s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3439/5680 [8:49:11<4:59:17,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3807', 'grad_norm': '0.397', 'learning_rate': '6.752e-05', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28172288, 'tokens/trainable': 27854106, 'epoch': '5.024'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3439/5680 [8:49:11<4:59:17,  8.01s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3440/5680 [8:49:19<4:58:21,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4253', 'grad_norm': '0.3908', 'learning_rate': '6.747e-05', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28180480, 'tokens/trainable': 27862244, 'epoch': '5.025'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3440/5680 [8:49:19<4:58:21,  7.99s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3441/5680 [8:49:27<4:58:04,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5299', 'grad_norm': '0.3797', 'learning_rate': '6.742e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28188672, 'tokens/trainable': 27870376, 'epoch': '5.025'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3441/5680 [8:49:27<4:58:04,  7.99s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3442/5680 [8:49:35<4:57:13,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4472', 'grad_norm': '0.4605', 'learning_rate': '6.737e-05', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28196864, 'tokens/trainable': 27878532, 'epoch': '5.025'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                           | 3442/5680 [8:49:35<4:57:13,  7.97s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3443/5680 [8:49:43<4:57:42,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5132', 'grad_norm': '0.4145', 'learning_rate': '6.732e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 28205056, 'tokens/trainable': 27886640, 'epoch': '5.025'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3443/5680 [8:49:43<4:57:42,  7.98s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3444/5680 [8:49:51<4:57:17,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.2756', 'grad_norm': '0.3168', 'learning_rate': '6.726e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 28213248, 'tokens/trainable': 27894806, 'epoch': '5.025'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3444/5680 [8:49:51<4:57:17,  7.98s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3445/5680 [8:49:59<4:56:20,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8307', 'grad_norm': '0.3613', 'learning_rate': '6.721e-05', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28221440, 'tokens/trainable': 27902956, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3445/5680 [8:49:59<4:56:20,  7.96s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3446/5680 [8:50:07<4:56:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3914', 'grad_norm': '0.3433', 'learning_rate': '6.716e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 28229632, 'tokens/trainable': 27911092, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 3446/5680 [8:50:07<4:56:17,  7.96s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3447/5680 [8:50:15<4:55:47,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5846', 'grad_norm': '0.3672', 'learning_rate': '6.711e-05', 'ppl': '1.794', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28237824, 'tokens/trainable': 27919232, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3447/5680 [8:50:15<4:55:47,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3448/5680 [8:50:23<4:55:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.581', 'grad_norm': '0.398', 'learning_rate': '6.705e-05', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28246016, 'tokens/trainable': 27927416, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3448/5680 [8:50:23<4:55:55,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3449/5680 [8:50:30<4:55:39,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3913', 'grad_norm': '0.3381', 'learning_rate': '6.7e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28254208, 'tokens/trainable': 27935568, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3449/5680 [8:50:30<4:55:39,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3450/5680 [8:50:38<4:55:30,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3821', 'grad_norm': '0.3867', 'learning_rate': '6.695e-05', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28262400, 'tokens/trainable': 27943668, 'epoch': '5.026'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 3450/5680 [8:50:38<4:55:30,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3451/5680 [8:50:46<4:54:54,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4225', 'grad_norm': '0.3571', 'learning_rate': '6.69e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28270592, 'tokens/trainable': 27951812, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3451/5680 [8:50:46<4:54:54,  7.94s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3452/5680 [8:50:54<4:55:13,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5316', 'grad_norm': '0.3581', 'learning_rate': '6.685e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28278784, 'tokens/trainable': 27959980, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3452/5680 [8:50:54<4:55:13,  7.95s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3453/5680 [8:51:02<4:54:49,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.614', 'grad_norm': '0.4061', 'learning_rate': '6.679e-05', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28286976, 'tokens/trainable': 27968088, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 3453/5680 [8:51:02<4:54:49,  7.94s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3454/5680 [8:51:10<4:54:17,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.4012', 'learning_rate': '6.674e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 28295168, 'tokens/trainable': 27976246, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3454/5680 [8:51:10<4:54:17,  7.93s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3455/5680 [8:51:18<4:54:15,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2787', 'grad_norm': '0.3144', 'learning_rate': '6.669e-05', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28303360, 'tokens/trainable': 27984368, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3455/5680 [8:51:18<4:54:15,  7.94s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3456/5680 [8:51:26<4:54:04,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4056', 'grad_norm': '0.4741', 'learning_rate': '6.664e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 28311552, 'tokens/trainable': 27992472, 'epoch': '5.027'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3456/5680 [8:51:26<4:54:04,  7.93s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3457/5680 [8:51:34<4:54:07,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4212', 'grad_norm': '0.3635', 'learning_rate': '6.658e-05', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28319744, 'tokens/trainable': 28000620, 'epoch': '5.028'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                           | 3457/5680 [8:51:34<4:54:07,  7.94s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3458/5680 [8:51:42<4:54:03,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5683', 'grad_norm': '0.3641', 'learning_rate': '6.653e-05', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28327936, 'tokens/trainable': 28008792, 'epoch': '5.028'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3458/5680 [8:51:42<4:54:03,  7.94s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3459/5680 [8:51:50<4:53:40,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5763', 'grad_norm': '0.4027', 'learning_rate': '6.648e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 28336128, 'tokens/trainable': 28016976, 'epoch': '5.028'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3459/5680 [8:51:50<4:53:40,  7.93s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3460/5680 [8:51:58<4:53:16,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4626', 'grad_norm': '0.3858', 'learning_rate': '6.643e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28344320, 'tokens/trainable': 28025128, 'epoch': '5.028'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3460/5680 [8:51:58<4:53:16,  7.93s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3461/5680 [8:52:06<4:53:32,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4391', 'grad_norm': '0.3806', 'learning_rate': '6.638e-05', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28352512, 'tokens/trainable': 28033276, 'epoch': '5.028'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 3461/5680 [8:52:06<4:53:32,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3462/5680 [8:52:14<4:52:57,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5419', 'grad_norm': '0.3594', 'learning_rate': '6.632e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28360704, 'tokens/trainable': 28041406, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3462/5680 [8:52:14<4:52:57,  7.92s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3463/5680 [8:52:21<4:52:40,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.7914', 'grad_norm': '0.4366', 'learning_rate': '6.627e-05', 'ppl': '2.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28368896, 'tokens/trainable': 28049572, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3463/5680 [8:52:21<4:52:40,  7.92s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3464/5680 [8:52:29<4:53:16,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2847', 'grad_norm': '0.3881', 'learning_rate': '6.622e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 28377088, 'tokens/trainable': 28057724, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 3464/5680 [8:52:29<4:53:16,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3465/5680 [8:52:37<4:53:14,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4958', 'grad_norm': '0.3955', 'learning_rate': '6.617e-05', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28385280, 'tokens/trainable': 28065864, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3465/5680 [8:52:37<4:53:14,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3466/5680 [8:52:45<4:53:11,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5798', 'grad_norm': '0.3867', 'learning_rate': '6.612e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 28393472, 'tokens/trainable': 28073952, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3466/5680 [8:52:45<4:53:11,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3467/5680 [8:52:53<4:53:26,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5484', 'grad_norm': '0.3665', 'learning_rate': '6.606e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 28401664, 'tokens/trainable': 28082064, 'epoch': '5.029'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3467/5680 [8:52:53<4:53:26,  7.96s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3468/5680 [8:53:01<4:53:00,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.434', 'grad_norm': '0.3712', 'learning_rate': '6.601e-05', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28409856, 'tokens/trainable': 28090212, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 3468/5680 [8:53:01<4:53:00,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3469/5680 [8:53:09<4:52:59,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3618', 'grad_norm': '0.3291', 'learning_rate': '6.596e-05', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28418048, 'tokens/trainable': 28098388, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3469/5680 [8:53:09<4:52:59,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3470/5680 [8:53:17<4:53:06,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4881', 'grad_norm': '0.4018', 'learning_rate': '6.591e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28426240, 'tokens/trainable': 28106506, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3470/5680 [8:53:17<4:53:06,  7.96s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3471/5680 [8:53:25<4:52:57,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5318', 'grad_norm': '0.3763', 'learning_rate': '6.586e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28434432, 'tokens/trainable': 28114612, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3471/5680 [8:53:25<4:52:57,  7.96s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3472/5680 [8:53:33<4:52:35,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2959', 'grad_norm': '0.2786', 'learning_rate': '6.58e-05', 'ppl': '1.344', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28442624, 'tokens/trainable': 28122788, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 3472/5680 [8:53:33<4:52:35,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3473/5680 [8:53:41<4:52:06,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6886', 'grad_norm': '0.4956', 'learning_rate': '6.575e-05', 'ppl': '1.991', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28450816, 'tokens/trainable': 28130918, 'epoch': '5.03'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3473/5680 [8:53:41<4:52:06,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3474/5680 [8:53:49<4:52:20,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3309', 'grad_norm': '0.4235', 'learning_rate': '6.57e-05', 'ppl': '1.392', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 28459008, 'tokens/trainable': 28139098, 'epoch': '5.031'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3474/5680 [8:53:49<4:52:20,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3475/5680 [8:53:57<4:52:02,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.433', 'grad_norm': '0.3328', 'learning_rate': '6.565e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 28467200, 'tokens/trainable': 28147236, 'epoch': '5.031'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3475/5680 [8:53:57<4:52:02,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3476/5680 [8:54:05<4:51:41,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3657', 'grad_norm': '0.3614', 'learning_rate': '6.56e-05', 'ppl': '1.442', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28475392, 'tokens/trainable': 28155382, 'epoch': '5.031'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 3476/5680 [8:54:05<4:51:41,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3477/5680 [8:54:13<4:51:44,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.396', 'grad_norm': '0.3441', 'learning_rate': '6.554e-05', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28483584, 'tokens/trainable': 28163564, 'epoch': '5.031'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3477/5680 [8:54:13<4:51:44,  7.95s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3478/5680 [8:54:21<4:51:18,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3946', 'grad_norm': '0.3378', 'learning_rate': '6.549e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 28491776, 'tokens/trainable': 28171644, 'epoch': '5.031'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3478/5680 [8:54:21<4:51:18,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3479/5680 [8:54:29<4:51:22,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.612', 'grad_norm': '0.4099', 'learning_rate': '6.544e-05', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28499968, 'tokens/trainable': 28179776, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 3479/5680 [8:54:29<4:51:22,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3480/5680 [8:54:37<4:54:39,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.5828', 'grad_norm': '0.4064', 'learning_rate': '6.539e-05', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989', 'tokens/total': 28508160, 'tokens/trainable': 28187936, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3480/5680 [8:54:37<4:54:39,  8.04s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3481/5680 [8:54:45<4:53:28,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4074', 'grad_norm': '0.3676', 'learning_rate': '6.534e-05', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28516352, 'tokens/trainable': 28196068, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3481/5680 [8:54:45<4:53:28,  8.01s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3482/5680 [8:54:53<4:52:55,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3623', 'grad_norm': '0.3777', 'learning_rate': '6.528e-05', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28524544, 'tokens/trainable': 28204176, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3482/5680 [8:54:53<4:52:55,  8.00s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3483/5680 [8:55:01<4:52:07,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5903', 'grad_norm': '0.3387', 'learning_rate': '6.523e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28532736, 'tokens/trainable': 28212348, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 3483/5680 [8:55:01<4:52:07,  7.98s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3484/5680 [8:55:09<4:51:54,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4822', 'grad_norm': '0.3636', 'learning_rate': '6.518e-05', 'ppl': '1.62', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 28540928, 'tokens/trainable': 28220490, 'epoch': '5.032'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3484/5680 [8:55:09<4:51:54,  7.98s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3485/5680 [8:55:17<4:51:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6514', 'grad_norm': '0.5023', 'learning_rate': '6.513e-05', 'ppl': '1.918', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28549120, 'tokens/trainable': 28228672, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3485/5680 [8:55:17<4:51:19,  7.96s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3486/5680 [8:55:25<4:51:00,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4437', 'grad_norm': '0.4428', 'learning_rate': '6.508e-05', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28557312, 'tokens/trainable': 28236764, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3486/5680 [8:55:25<4:51:00,  7.96s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3487/5680 [8:55:33<4:50:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6748', 'grad_norm': '0.4351', 'learning_rate': '6.503e-05', 'ppl': '1.964', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 28565504, 'tokens/trainable': 28244920, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 3487/5680 [8:55:33<4:50:17,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3488/5680 [8:55:40<4:49:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.608', 'grad_norm': '0.3938', 'learning_rate': '6.497e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28573696, 'tokens/trainable': 28253104, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3488/5680 [8:55:40<4:49:55,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3489/5680 [8:55:48<4:49:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3987', 'grad_norm': '0.3683', 'learning_rate': '6.492e-05', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 28581888, 'tokens/trainable': 28261164, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3489/5680 [8:55:48<4:49:47,  7.94s/it] 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3490/5680 [8:55:56<4:49:54,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5733', 'grad_norm': '0.4646', 'learning_rate': '6.487e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28590080, 'tokens/trainable': 28269316, 'epoch': '5.033'}
 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 3490/5680 [8:55:56<4:49:54,  7.94s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3491/5680 [8:56:04<4:49:28,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6035', 'grad_norm': '0.4002', 'learning_rate': '6.482e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28598272, 'tokens/trainable': 28277492, 'epoch': '5.034'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3491/5680 [8:56:04<4:49:28,  7.93s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3492/5680 [8:56:12<4:49:56,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4833', 'grad_norm': '0.4053', 'learning_rate': '6.477e-05', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 28606464, 'tokens/trainable': 28285616, 'epoch': '5.034'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3492/5680 [8:56:12<4:49:56,  7.95s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3493/5680 [8:56:20<4:49:29,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4483', 'grad_norm': '0.3219', 'learning_rate': '6.471e-05', 'ppl': '1.566', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28614656, 'tokens/trainable': 28293748, 'epoch': '5.034'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3493/5680 [8:56:20<4:49:29,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3494/5680 [8:56:28<4:49:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3347', 'grad_norm': '0.3212', 'learning_rate': '6.466e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 28622848, 'tokens/trainable': 28301856, 'epoch': '5.034'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 3494/5680 [8:56:28<4:49:24,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3495/5680 [8:56:36<4:49:23,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3075', 'grad_norm': '0.3102', 'learning_rate': '6.461e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 28631040, 'tokens/trainable': 28309952, 'epoch': '5.034'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3495/5680 [8:56:36<4:49:23,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3496/5680 [8:56:44<4:49:12,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3146', 'grad_norm': '0.3119', 'learning_rate': '6.456e-05', 'ppl': '1.37', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28639232, 'tokens/trainable': 28318104, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3496/5680 [8:56:44<4:49:12,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3497/5680 [8:56:52<4:49:16,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3486', 'grad_norm': '0.3407', 'learning_rate': '6.451e-05', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28647424, 'tokens/trainable': 28326220, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3497/5680 [8:56:52<4:49:16,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3498/5680 [8:57:00<4:49:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7235', 'grad_norm': '0.5531', 'learning_rate': '6.446e-05', 'ppl': '2.062', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 28655616, 'tokens/trainable': 28334360, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 3498/5680 [8:57:00<4:49:19,  7.96s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3499/5680 [8:57:08<4:49:12,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4341', 'grad_norm': '0.4373', 'learning_rate': '6.44e-05', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28663808, 'tokens/trainable': 28342528, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3499/5680 [8:57:08<4:49:12,  7.96s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3500/5680 [8:57:16<4:48:40,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4781', 'grad_norm': '0.4083', 'learning_rate': '6.435e-05', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28672000, 'tokens/trainable': 28350704, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3500/5680 [8:57:16<4:48:40,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3501/5680 [8:57:24<4:48:58,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4075', 'grad_norm': '0.4167', 'learning_rate': '6.43e-05', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28680192, 'tokens/trainable': 28358874, 'epoch': '5.035'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 3501/5680 [8:57:24<4:48:58,  7.96s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3502/5680 [8:57:32<4:48:23,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6355', 'grad_norm': '0.4598', 'learning_rate': '6.425e-05', 'ppl': '1.888', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28688384, 'tokens/trainable': 28367036, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3502/5680 [8:57:32<4:48:23,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3503/5680 [8:57:40<4:48:25,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3832', 'grad_norm': '0.3503', 'learning_rate': '6.42e-05', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 28696576, 'tokens/trainable': 28375200, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3503/5680 [8:57:40<4:48:25,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3504/5680 [8:57:48<4:48:09,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4505', 'grad_norm': '0.317', 'learning_rate': '6.415e-05', 'ppl': '1.569', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 28704768, 'tokens/trainable': 28383290, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3504/5680 [8:57:48<4:48:09,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3505/5680 [8:57:56<4:47:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3761', 'grad_norm': '0.351', 'learning_rate': '6.409e-05', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28712960, 'tokens/trainable': 28391436, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 3505/5680 [8:57:56<4:47:55,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3506/5680 [8:58:04<4:47:53,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5416', 'grad_norm': '0.4124', 'learning_rate': '6.404e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28721152, 'tokens/trainable': 28399616, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3506/5680 [8:58:04<4:47:53,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3507/5680 [8:58:11<4:47:42,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3801', 'grad_norm': '0.3061', 'learning_rate': '6.399e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28729344, 'tokens/trainable': 28407750, 'epoch': '5.036'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3507/5680 [8:58:11<4:47:42,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3508/5680 [8:58:19<4:47:36,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3077', 'grad_norm': '0.4089', 'learning_rate': '6.394e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28737536, 'tokens/trainable': 28415916, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3508/5680 [8:58:19<4:47:36,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3509/5680 [8:58:27<4:47:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6426', 'grad_norm': '0.4927', 'learning_rate': '6.389e-05', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28745728, 'tokens/trainable': 28424090, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 3509/5680 [8:58:27<4:47:17,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3510/5680 [8:58:35<4:47:13,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.8562', 'grad_norm': '0.3987', 'learning_rate': '6.384e-05', 'ppl': '2.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28753920, 'tokens/trainable': 28432268, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3510/5680 [8:58:35<4:47:13,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3511/5680 [8:58:43<4:47:24,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4875', 'grad_norm': '0.4047', 'learning_rate': '6.378e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28762112, 'tokens/trainable': 28440432, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3511/5680 [8:58:43<4:47:24,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3512/5680 [8:58:51<4:47:06,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.8629', 'grad_norm': '0.491', 'learning_rate': '6.373e-05', 'ppl': '2.37', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28770304, 'tokens/trainable': 28448566, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3512/5680 [8:58:51<4:47:06,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3513/5680 [8:58:59<4:47:18,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.719', 'grad_norm': '0.374', 'learning_rate': '6.368e-05', 'ppl': '2.052', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28778496, 'tokens/trainable': 28456736, 'epoch': '5.037'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 3513/5680 [8:58:59<4:47:18,  7.96s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3514/5680 [8:59:07<4:46:34,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3239', 'grad_norm': '0.3221', 'learning_rate': '6.363e-05', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28786688, 'tokens/trainable': 28464892, 'epoch': '5.038'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3514/5680 [8:59:07<4:46:34,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3515/5680 [8:59:15<4:46:54,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3782', 'grad_norm': '0.3628', 'learning_rate': '6.358e-05', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28794880, 'tokens/trainable': 28473056, 'epoch': '5.038'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3515/5680 [8:59:15<4:46:54,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3516/5680 [8:59:23<4:46:11,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3371', 'grad_norm': '0.3826', 'learning_rate': '6.353e-05', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28803072, 'tokens/trainable': 28481172, 'epoch': '5.038'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 3516/5680 [8:59:23<4:46:11,  7.93s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3517/5680 [8:59:31<4:46:26,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5692', 'grad_norm': '0.4074', 'learning_rate': '6.348e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 28811264, 'tokens/trainable': 28489298, 'epoch': '5.038'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3517/5680 [8:59:31<4:46:26,  7.95s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3518/5680 [8:59:39<4:45:50,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4691', 'grad_norm': '0.3603', 'learning_rate': '6.342e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28819456, 'tokens/trainable': 28497428, 'epoch': '5.038'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3518/5680 [8:59:39<4:45:50,  7.93s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3519/5680 [8:59:47<4:45:50,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4914', 'grad_norm': '0.4273', 'learning_rate': '6.337e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 28827648, 'tokens/trainable': 28505540, 'epoch': '5.039'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3519/5680 [8:59:47<4:45:50,  7.94s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3520/5680 [8:59:55<4:46:20,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4051', 'grad_norm': '0.4304', 'learning_rate': '6.332e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28835840, 'tokens/trainable': 28513720, 'epoch': '5.039'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 3520/5680 [8:59:55<4:46:20,  7.95s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3521/5680 [9:00:03<4:46:36,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4818', 'grad_norm': '0.3532', 'learning_rate': '6.327e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 28844032, 'tokens/trainable': 28521892, 'epoch': '5.039'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3521/5680 [9:00:03<4:46:36,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3522/5680 [9:00:11<4:45:42,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3222', 'grad_norm': '0.3794', 'learning_rate': '6.322e-05', 'ppl': '1.38', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 28852224, 'tokens/trainable': 28530068, 'epoch': '5.039'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3522/5680 [9:00:11<4:45:42,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3523/5680 [9:00:19<4:49:15,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5084', 'grad_norm': '0.3431', 'learning_rate': '6.317e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985', 'tokens/total': 28860416, 'tokens/trainable': 28538228, 'epoch': '5.039'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3523/5680 [9:00:19<4:49:15,  8.05s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3524/5680 [9:00:27<4:48:22,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4579', 'grad_norm': '0.451', 'learning_rate': '6.312e-05', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 28868608, 'tokens/trainable': 28546358, 'epoch': '5.039'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 3524/5680 [9:00:27<4:48:22,  8.03s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3525/5680 [9:00:35<4:46:48,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4467', 'grad_norm': '0.3829', 'learning_rate': '6.306e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 28876800, 'tokens/trainable': 28554542, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3525/5680 [9:00:35<4:46:48,  7.99s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3526/5680 [9:00:43<4:46:00,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.514', 'grad_norm': '0.4114', 'learning_rate': '6.301e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28884992, 'tokens/trainable': 28562680, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3526/5680 [9:00:43<4:46:00,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3527/5680 [9:00:51<4:45:14,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6642', 'grad_norm': '0.359', 'learning_rate': '6.296e-05', 'ppl': '1.943', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 28893184, 'tokens/trainable': 28570802, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 3527/5680 [9:00:51<4:45:14,  7.95s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3528/5680 [9:00:59<4:44:53,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4704', 'grad_norm': '0.4346', 'learning_rate': '6.291e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28901376, 'tokens/trainable': 28578928, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3528/5680 [9:00:59<4:44:53,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3529/5680 [9:01:07<4:45:36,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4348', 'grad_norm': '0.3772', 'learning_rate': '6.286e-05', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 28909568, 'tokens/trainable': 28587100, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3529/5680 [9:01:07<4:45:36,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3530/5680 [9:01:15<4:45:03,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3842', 'grad_norm': '0.3593', 'learning_rate': '6.281e-05', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 28917760, 'tokens/trainable': 28595248, 'epoch': '5.04'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3530/5680 [9:01:15<4:45:03,  7.96s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3531/5680 [9:01:22<4:44:14,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4048', 'grad_norm': '0.3985', 'learning_rate': '6.276e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 28925952, 'tokens/trainable': 28603398, 'epoch': '5.041'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 3531/5680 [9:01:22<4:44:14,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3532/5680 [9:01:30<4:44:13,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4897', 'grad_norm': '0.385', 'learning_rate': '6.27e-05', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 28934144, 'tokens/trainable': 28611532, 'epoch': '5.041'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3532/5680 [9:01:30<4:44:13,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3533/5680 [9:01:38<4:44:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5071', 'grad_norm': '0.3614', 'learning_rate': '6.265e-05', 'ppl': '1.66', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 28942336, 'tokens/trainable': 28619664, 'epoch': '5.041'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3533/5680 [9:01:38<4:44:17,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3534/5680 [9:01:46<4:44:04,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2968', 'grad_norm': '0.3462', 'learning_rate': '6.26e-05', 'ppl': '1.346', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 28950528, 'tokens/trainable': 28627840, 'epoch': '5.041'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3534/5680 [9:01:46<4:44:04,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3535/5680 [9:01:54<4:43:50,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4449', 'grad_norm': '0.431', 'learning_rate': '6.255e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 28958720, 'tokens/trainable': 28636016, 'epoch': '5.041'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 3535/5680 [9:01:54<4:43:50,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3536/5680 [9:02:02<4:43:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6319', 'grad_norm': '0.4072', 'learning_rate': '6.25e-05', 'ppl': '1.881', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 28966912, 'tokens/trainable': 28644188, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3536/5680 [9:02:02<4:43:47,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3537/5680 [9:02:10<4:43:37,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5009', 'grad_norm': '0.3619', 'learning_rate': '6.245e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 28975104, 'tokens/trainable': 28652376, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3537/5680 [9:02:10<4:43:37,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3538/5680 [9:02:18<4:43:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7549', 'grad_norm': '0.3848', 'learning_rate': '6.24e-05', 'ppl': '2.127', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 28983296, 'tokens/trainable': 28660560, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 3538/5680 [9:02:18<4:43:24,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3539/5680 [9:02:26<4:43:27,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3411', 'grad_norm': '0.3254', 'learning_rate': '6.235e-05', 'ppl': '1.407', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 28991488, 'tokens/trainable': 28668710, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3539/5680 [9:02:26<4:43:27,  7.94s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3540/5680 [9:02:34<4:46:29,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5752', 'grad_norm': '0.397', 'learning_rate': '6.229e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.7', 'tokens/total': 28999680, 'tokens/trainable': 28676864, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3540/5680 [9:02:34<4:46:29,  8.03s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3541/5680 [9:02:42<4:45:59,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4447', 'grad_norm': '0.3156', 'learning_rate': '6.224e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29007872, 'tokens/trainable': 28684992, 'epoch': '5.042'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3541/5680 [9:02:42<4:45:59,  8.02s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3542/5680 [9:02:50<4:44:28,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4668', 'grad_norm': '0.4105', 'learning_rate': '6.219e-05', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 29016064, 'tokens/trainable': 28693156, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 3542/5680 [9:02:50<4:44:28,  7.98s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3543/5680 [9:02:58<4:43:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4421', 'grad_norm': '0.329', 'learning_rate': '6.214e-05', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 29024256, 'tokens/trainable': 28701318, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3543/5680 [9:02:58<4:43:45,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3544/5680 [9:03:06<4:43:33,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.522', 'grad_norm': '0.3848', 'learning_rate': '6.209e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 29032448, 'tokens/trainable': 28709508, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3544/5680 [9:03:06<4:43:33,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3545/5680 [9:03:14<4:43:35,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3233', 'grad_norm': '0.3291', 'learning_rate': '6.204e-05', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29040640, 'tokens/trainable': 28717684, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3545/5680 [9:03:14<4:43:35,  7.97s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3546/5680 [9:03:22<4:43:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6804', 'grad_norm': '0.3909', 'learning_rate': '6.199e-05', 'ppl': '1.975', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29048832, 'tokens/trainable': 28725808, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 3546/5680 [9:03:22<4:43:17,  7.96s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3547/5680 [9:03:30<4:42:48,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4597', 'grad_norm': '0.3981', 'learning_rate': '6.194e-05', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 29057024, 'tokens/trainable': 28733986, 'epoch': '5.043'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3547/5680 [9:03:30<4:42:48,  7.96s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3548/5680 [9:03:38<4:43:00,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5843', 'grad_norm': '0.3923', 'learning_rate': '6.189e-05', 'ppl': '1.794', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29065216, 'tokens/trainable': 28742154, 'epoch': '5.044'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3548/5680 [9:03:38<4:43:00,  7.96s/it] 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3549/5680 [9:03:46<4:42:38,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.7147', 'grad_norm': '0.3914', 'learning_rate': '6.183e-05', 'ppl': '2.043', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29073408, 'tokens/trainable': 28750232, 'epoch': '5.044'}
 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 3549/5680 [9:03:46<4:42:38,  7.96s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3550/5680 [9:03:54<4:42:56,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5218', 'grad_norm': '0.353', 'learning_rate': '6.178e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29081600, 'tokens/trainable': 28758352, 'epoch': '5.044'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3550/5680 [9:03:54<4:42:56,  7.97s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3551/5680 [9:04:02<4:42:19,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.318', 'grad_norm': '0.3573', 'learning_rate': '6.173e-05', 'ppl': '1.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 29089792, 'tokens/trainable': 28766520, 'epoch': '5.044'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3551/5680 [9:04:02<4:42:19,  7.96s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3552/5680 [9:04:10<4:42:02,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6664', 'grad_norm': '0.6021', 'learning_rate': '6.168e-05', 'ppl': '1.947', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 29097984, 'tokens/trainable': 28774702, 'epoch': '5.044'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3552/5680 [9:04:10<4:42:02,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3553/5680 [9:04:18<4:41:58,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4018', 'grad_norm': '0.4173', 'learning_rate': '6.163e-05', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29106176, 'tokens/trainable': 28782860, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 3553/5680 [9:04:18<4:41:58,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3554/5680 [9:04:26<4:42:23,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4492', 'grad_norm': '0.4065', 'learning_rate': '6.158e-05', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29114368, 'tokens/trainable': 28791036, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3554/5680 [9:04:26<4:42:23,  7.97s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3555/5680 [9:04:34<4:42:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.594', 'grad_norm': '0.4205', 'learning_rate': '6.153e-05', 'ppl': '1.811', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29122560, 'tokens/trainable': 28799162, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3555/5680 [9:04:34<4:42:12,  7.97s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3556/5680 [9:04:42<4:41:41,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.374', 'grad_norm': '0.3325', 'learning_rate': '6.148e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 29130752, 'tokens/trainable': 28807332, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3556/5680 [9:04:42<4:41:41,  7.96s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3557/5680 [9:04:49<4:41:20,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4617', 'grad_norm': '0.4464', 'learning_rate': '6.143e-05', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29138944, 'tokens/trainable': 28815468, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 3557/5680 [9:04:49<4:41:20,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3558/5680 [9:04:57<4:41:01,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4867', 'grad_norm': '0.4429', 'learning_rate': '6.137e-05', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 29147136, 'tokens/trainable': 28823552, 'epoch': '5.045'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3558/5680 [9:04:57<4:41:01,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3559/5680 [9:05:05<4:41:55,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6031', 'grad_norm': '0.4167', 'learning_rate': '6.132e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29155328, 'tokens/trainable': 28831740, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3559/5680 [9:05:05<4:41:55,  7.98s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3560/5680 [9:05:13<4:41:48,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5492', 'grad_norm': '0.4216', 'learning_rate': '6.127e-05', 'ppl': '1.732', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29163520, 'tokens/trainable': 28839910, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3560/5680 [9:05:13<4:41:48,  7.98s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3561/5680 [9:05:21<4:41:27,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4432', 'grad_norm': '0.4086', 'learning_rate': '6.122e-05', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29171712, 'tokens/trainable': 28848028, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 3561/5680 [9:05:21<4:41:27,  7.97s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3562/5680 [9:05:29<4:40:28,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3718', 'grad_norm': '0.3695', 'learning_rate': '6.117e-05', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 29179904, 'tokens/trainable': 28856176, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3562/5680 [9:05:29<4:40:28,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3563/5680 [9:05:37<4:40:27,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5188', 'grad_norm': '0.4273', 'learning_rate': '6.112e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29188096, 'tokens/trainable': 28864328, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3563/5680 [9:05:37<4:40:27,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3564/5680 [9:05:45<4:40:48,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3926', 'grad_norm': '0.365', 'learning_rate': '6.107e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29196288, 'tokens/trainable': 28872494, 'epoch': '5.046'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 3564/5680 [9:05:45<4:40:48,  7.96s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3565/5680 [9:05:53<4:40:22,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7481', 'grad_norm': '0.4202', 'learning_rate': '6.102e-05', 'ppl': '2.113', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 29204480, 'tokens/trainable': 28880652, 'epoch': '5.047'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3565/5680 [9:05:53<4:40:22,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3566/5680 [9:06:01<4:43:08,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.5254', 'grad_norm': '0.366', 'learning_rate': '6.097e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.8', 'tokens/total': 29212672, 'tokens/trainable': 28888820, 'epoch': '5.047'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3566/5680 [9:06:01<4:43:08,  8.04s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3567/5680 [9:06:09<4:42:44,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4208', 'grad_norm': '0.347', 'learning_rate': '6.092e-05', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29220864, 'tokens/trainable': 28896952, 'epoch': '5.047'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3567/5680 [9:06:09<4:42:44,  8.03s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3568/5680 [9:06:17<4:42:02,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6079', 'grad_norm': '0.4424', 'learning_rate': '6.086e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29229056, 'tokens/trainable': 28905136, 'epoch': '5.047'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 3568/5680 [9:06:17<4:42:02,  8.01s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3569/5680 [9:06:25<4:41:16,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5202', 'grad_norm': '0.42', 'learning_rate': '6.081e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29237248, 'tokens/trainable': 28913296, 'epoch': '5.047'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3569/5680 [9:06:25<4:41:16,  7.99s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3570/5680 [9:06:33<4:40:23,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6107', 'grad_norm': '0.3809', 'learning_rate': '6.076e-05', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 29245440, 'tokens/trainable': 28921472, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3570/5680 [9:06:33<4:40:23,  7.97s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3571/5680 [9:06:41<4:39:52,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3628', 'grad_norm': '0.3755', 'learning_rate': '6.071e-05', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29253632, 'tokens/trainable': 28929616, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3571/5680 [9:06:41<4:39:52,  7.96s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3572/5680 [9:06:49<4:39:03,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4944', 'grad_norm': '0.3456', 'learning_rate': '6.066e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 29261824, 'tokens/trainable': 28937770, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 3572/5680 [9:06:49<4:39:03,  7.94s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3573/5680 [9:06:57<4:39:18,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5139', 'grad_norm': '0.416', 'learning_rate': '6.061e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 29270016, 'tokens/trainable': 28945860, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3573/5680 [9:06:57<4:39:18,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3574/5680 [9:07:05<4:38:51,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7382', 'grad_norm': '0.408', 'learning_rate': '6.056e-05', 'ppl': '2.092', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29278208, 'tokens/trainable': 28953976, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3574/5680 [9:07:05<4:38:51,  7.94s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3575/5680 [9:07:13<4:39:05,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4442', 'grad_norm': '0.357', 'learning_rate': '6.051e-05', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29286400, 'tokens/trainable': 28962140, 'epoch': '5.048'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3575/5680 [9:07:13<4:39:05,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3576/5680 [9:07:21<4:38:25,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3566', 'grad_norm': '0.367', 'learning_rate': '6.046e-05', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 29294592, 'tokens/trainable': 28970302, 'epoch': '5.049'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3576/5680 [9:07:21<4:38:25,  7.94s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3577/5680 [9:07:29<4:38:35,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3454', 'grad_norm': '0.5199', 'learning_rate': '6.041e-05', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29302784, 'tokens/trainable': 28978434, 'epoch': '5.049'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3577/5680 [9:07:29<4:38:35,  7.95s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3578/5680 [9:07:37<4:38:07,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6159', 'grad_norm': '0.4403', 'learning_rate': '6.036e-05', 'ppl': '1.851', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29310976, 'tokens/trainable': 28986556, 'epoch': '5.049'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3578/5680 [9:07:37<4:38:07,  7.94s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3579/5680 [9:07:45<4:38:01,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5705', 'grad_norm': '0.5208', 'learning_rate': '6.031e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29319168, 'tokens/trainable': 28994628, 'epoch': '5.049'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 3579/5680 [9:07:45<4:38:01,  7.94s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3580/5680 [9:07:53<4:37:54,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5261', 'grad_norm': '0.3805', 'learning_rate': '6.025e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29327360, 'tokens/trainable': 29002730, 'epoch': '5.049'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3580/5680 [9:07:53<4:37:54,  7.94s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3581/5680 [9:08:01<4:38:05,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5285', 'grad_norm': '0.3617', 'learning_rate': '6.02e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29335552, 'tokens/trainable': 29010864, 'epoch': '5.049'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3581/5680 [9:08:01<4:38:05,  7.95s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3582/5680 [9:08:09<4:38:00,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3467', 'grad_norm': '0.3036', 'learning_rate': '6.015e-05', 'ppl': '1.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 29343744, 'tokens/trainable': 29019050, 'epoch': '5.05'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3582/5680 [9:08:09<4:38:00,  7.95s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3583/5680 [9:08:16<4:37:50,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5203', 'grad_norm': '0.4324', 'learning_rate': '6.01e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29351936, 'tokens/trainable': 29027152, 'epoch': '5.05'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 3583/5680 [9:08:16<4:37:50,  7.95s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3584/5680 [9:08:24<4:37:55,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4693', 'grad_norm': '0.3692', 'learning_rate': '6.005e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29360128, 'tokens/trainable': 29035288, 'epoch': '5.05'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3584/5680 [9:08:24<4:37:55,  7.96s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3585/5680 [9:08:32<4:37:48,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.536', 'grad_norm': '0.388', 'learning_rate': '6e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29368320, 'tokens/trainable': 29043400, 'epoch': '5.05'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3585/5680 [9:08:32<4:37:48,  7.96s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3586/5680 [9:08:40<4:37:27,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6718', 'grad_norm': '0.3734', 'learning_rate': '5.995e-05', 'ppl': '1.958', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 29376512, 'tokens/trainable': 29051572, 'epoch': '5.05'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 3586/5680 [9:08:40<4:37:27,  7.95s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3587/5680 [9:08:48<4:37:00,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4386', 'grad_norm': '0.3711', 'learning_rate': '5.99e-05', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29384704, 'tokens/trainable': 29059670, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3587/5680 [9:08:48<4:37:00,  7.94s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3588/5680 [9:08:56<4:36:53,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5879', 'grad_norm': '0.4427', 'learning_rate': '5.985e-05', 'ppl': '1.8', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29392896, 'tokens/trainable': 29067812, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3588/5680 [9:08:56<4:36:53,  7.94s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3589/5680 [9:09:04<4:36:53,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6219', 'grad_norm': '0.3844', 'learning_rate': '5.98e-05', 'ppl': '1.862', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29401088, 'tokens/trainable': 29075958, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3589/5680 [9:09:04<4:36:53,  7.95s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3590/5680 [9:09:12<4:37:37,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7052', 'grad_norm': '0.3782', 'learning_rate': '5.975e-05', 'ppl': '2.024', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29409280, 'tokens/trainable': 29084148, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 3590/5680 [9:09:12<4:37:37,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3591/5680 [9:09:20<4:37:34,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5829', 'grad_norm': '0.3785', 'learning_rate': '5.97e-05', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29417472, 'tokens/trainable': 29092298, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3591/5680 [9:09:20<4:37:34,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3592/5680 [9:09:28<4:37:20,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4012', 'grad_norm': '0.4108', 'learning_rate': '5.965e-05', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 29425664, 'tokens/trainable': 29100488, 'epoch': '5.051'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3592/5680 [9:09:28<4:37:20,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3593/5680 [9:09:36<4:37:34,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4967', 'grad_norm': '0.3535', 'learning_rate': '5.96e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29433856, 'tokens/trainable': 29108614, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3593/5680 [9:09:36<4:37:34,  7.98s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3594/5680 [9:09:44<4:37:34,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3844', 'grad_norm': '0.409', 'learning_rate': '5.955e-05', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29442048, 'tokens/trainable': 29116738, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 3594/5680 [9:09:44<4:37:34,  7.98s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3595/5680 [9:09:52<4:38:10,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5834', 'grad_norm': '0.4306', 'learning_rate': '5.949e-05', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 29450240, 'tokens/trainable': 29124886, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3595/5680 [9:09:52<4:38:10,  8.01s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3596/5680 [9:10:00<4:37:57,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5681', 'grad_norm': '0.4725', 'learning_rate': '5.944e-05', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 29458432, 'tokens/trainable': 29132988, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3596/5680 [9:10:00<4:37:57,  8.00s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3597/5680 [9:10:08<4:37:39,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3165', 'grad_norm': '0.4034', 'learning_rate': '5.939e-05', 'ppl': '1.372', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29466624, 'tokens/trainable': 29141092, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3597/5680 [9:10:08<4:37:39,  8.00s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3598/5680 [9:10:16<4:37:33,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3737', 'grad_norm': '0.4268', 'learning_rate': '5.934e-05', 'ppl': '1.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29474816, 'tokens/trainable': 29149228, 'epoch': '5.052'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3598/5680 [9:10:16<4:37:33,  8.00s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3599/5680 [9:10:24<4:37:09,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6139', 'grad_norm': '0.3509', 'learning_rate': '5.929e-05', 'ppl': '1.848', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29483008, 'tokens/trainable': 29157316, 'epoch': '5.053'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3599/5680 [9:10:24<4:37:09,  7.99s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3600/5680 [9:10:32<4:36:39,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4788', 'grad_norm': '0.4114', 'learning_rate': '5.924e-05', 'ppl': '1.614', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29491200, 'tokens/trainable': 29165460, 'epoch': '5.053'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3600/5680 [9:10:32<4:36:39,  7.98s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3601/5680 [9:10:40<4:36:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4036', 'grad_norm': '0.3539', 'learning_rate': '5.919e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 29499392, 'tokens/trainable': 29173628, 'epoch': '5.053'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 3601/5680 [9:10:40<4:36:15,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3602/5680 [9:10:48<4:36:06,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5871', 'grad_norm': '0.3502', 'learning_rate': '5.914e-05', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29507584, 'tokens/trainable': 29181724, 'epoch': '5.053'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3602/5680 [9:10:48<4:36:06,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3603/5680 [9:10:56<4:35:48,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5266', 'grad_norm': '0.366', 'learning_rate': '5.909e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29515776, 'tokens/trainable': 29189838, 'epoch': '5.053'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3603/5680 [9:10:56<4:35:48,  7.97s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3604/5680 [9:11:04<4:36:17,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7006', 'grad_norm': '0.3924', 'learning_rate': '5.904e-05', 'ppl': '2.015', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29523968, 'tokens/trainable': 29198002, 'epoch': '5.054'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3604/5680 [9:11:04<4:36:17,  7.99s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3605/5680 [9:11:12<4:37:21,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4964', 'grad_norm': '0.3748', 'learning_rate': '5.899e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 29532160, 'tokens/trainable': 29206172, 'epoch': '5.054'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 3605/5680 [9:11:12<4:37:21,  8.02s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3606/5680 [9:11:20<4:37:34,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5121', 'grad_norm': '0.3772', 'learning_rate': '5.894e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 29540352, 'tokens/trainable': 29214312, 'epoch': '5.054'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3606/5680 [9:11:20<4:37:34,  8.03s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3607/5680 [9:11:28<4:37:54,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.5929', 'grad_norm': '0.382', 'learning_rate': '5.889e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 29548544, 'tokens/trainable': 29222420, 'epoch': '5.054'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3607/5680 [9:11:28<4:37:54,  8.04s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3608/5680 [9:11:36<4:36:56,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4299', 'grad_norm': '0.3172', 'learning_rate': '5.884e-05', 'ppl': '1.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 29556736, 'tokens/trainable': 29230596, 'epoch': '5.054'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3608/5680 [9:11:36<4:36:56,  8.02s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3609/5680 [9:11:45<4:39:59,  8.11s/it]                                                                                                                                                                                                                                             {'loss': '0.643', 'grad_norm': '0.4261', 'learning_rate': '5.879e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.6', 'tokens/total': 29564928, 'tokens/trainable': 29238684, 'epoch': '5.054'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                      | 3609/5680 [9:11:45<4:39:59,  8.11s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3610/5680 [9:11:53<4:38:45,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.4656', 'grad_norm': '0.4186', 'learning_rate': '5.874e-05', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29573120, 'tokens/trainable': 29246852, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3610/5680 [9:11:53<4:38:45,  8.08s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3611/5680 [9:12:01<4:37:35,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.6472', 'grad_norm': '0.3885', 'learning_rate': '5.869e-05', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29581312, 'tokens/trainable': 29255012, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3611/5680 [9:12:01<4:37:35,  8.05s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3612/5680 [9:12:08<4:36:30,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.3479', 'grad_norm': '0.392', 'learning_rate': '5.864e-05', 'ppl': '1.416', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 29589504, 'tokens/trainable': 29263184, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 3612/5680 [9:12:08<4:36:30,  8.02s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3613/5680 [9:12:16<4:36:07,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4612', 'grad_norm': '0.4187', 'learning_rate': '5.859e-05', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 29597696, 'tokens/trainable': 29271336, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3613/5680 [9:12:16<4:36:07,  8.02s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3614/5680 [9:12:24<4:35:38,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6592', 'grad_norm': '0.4084', 'learning_rate': '5.854e-05', 'ppl': '1.933', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29605888, 'tokens/trainable': 29279486, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3614/5680 [9:12:24<4:35:38,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3615/5680 [9:12:32<4:35:23,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5315', 'grad_norm': '0.4412', 'learning_rate': '5.849e-05', 'ppl': '1.701', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29614080, 'tokens/trainable': 29287666, 'epoch': '5.055'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3615/5680 [9:12:32<4:35:23,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3616/5680 [9:12:40<4:35:24,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5367', 'grad_norm': '0.3674', 'learning_rate': '5.844e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 29622272, 'tokens/trainable': 29295844, 'epoch': '5.056'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 3616/5680 [9:12:40<4:35:24,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3617/5680 [9:12:48<4:34:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5441', 'grad_norm': '0.4267', 'learning_rate': '5.839e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29630464, 'tokens/trainable': 29303982, 'epoch': '5.056'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3617/5680 [9:12:48<4:34:53,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3618/5680 [9:12:56<4:34:53,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5257', 'grad_norm': '0.3375', 'learning_rate': '5.834e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 29638656, 'tokens/trainable': 29312098, 'epoch': '5.056'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3618/5680 [9:12:56<4:34:53,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3619/5680 [9:13:04<4:35:07,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4612', 'grad_norm': '0.3736', 'learning_rate': '5.828e-05', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29646848, 'tokens/trainable': 29320252, 'epoch': '5.056'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3619/5680 [9:13:04<4:35:07,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3620/5680 [9:13:12<4:34:51,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4992', 'grad_norm': '0.3939', 'learning_rate': '5.823e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29655040, 'tokens/trainable': 29328428, 'epoch': '5.056'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 3620/5680 [9:13:12<4:34:51,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3621/5680 [9:13:20<4:34:41,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3798', 'grad_norm': '0.3853', 'learning_rate': '5.818e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29663232, 'tokens/trainable': 29336550, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3621/5680 [9:13:20<4:34:41,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3622/5680 [9:13:28<4:34:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6727', 'grad_norm': '0.3802', 'learning_rate': '5.813e-05', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29671424, 'tokens/trainable': 29344716, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3622/5680 [9:13:28<4:34:13,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3623/5680 [9:13:36<4:34:08,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3386', 'grad_norm': '0.4657', 'learning_rate': '5.808e-05', 'ppl': '1.403', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29679616, 'tokens/trainable': 29352888, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3623/5680 [9:13:36<4:34:08,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3624/5680 [9:13:44<4:33:39,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4084', 'grad_norm': '0.3204', 'learning_rate': '5.803e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 29687808, 'tokens/trainable': 29361068, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3624/5680 [9:13:44<4:33:39,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3625/5680 [9:13:52<4:33:25,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6048', 'grad_norm': '0.4101', 'learning_rate': '5.798e-05', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29696000, 'tokens/trainable': 29369216, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3625/5680 [9:13:52<4:33:25,  7.98s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3626/5680 [9:14:00<4:33:39,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6136', 'grad_norm': '0.3667', 'learning_rate': '5.793e-05', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29704192, 'tokens/trainable': 29377400, 'epoch': '5.057'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3626/5680 [9:14:00<4:33:39,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3627/5680 [9:14:08<4:33:44,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4497', 'grad_norm': '0.4071', 'learning_rate': '5.788e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 29712384, 'tokens/trainable': 29385490, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 3627/5680 [9:14:08<4:33:44,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3628/5680 [9:14:16<4:33:32,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4247', 'grad_norm': '0.3229', 'learning_rate': '5.783e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 29720576, 'tokens/trainable': 29393632, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3628/5680 [9:14:16<4:33:32,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3629/5680 [9:14:24<4:33:26,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3668', 'grad_norm': '0.3724', 'learning_rate': '5.778e-05', 'ppl': '1.443', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29728768, 'tokens/trainable': 29401820, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3629/5680 [9:14:24<4:33:26,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3630/5680 [9:14:32<4:33:18,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5609', 'grad_norm': '0.3703', 'learning_rate': '5.773e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29736960, 'tokens/trainable': 29409944, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3630/5680 [9:14:32<4:33:18,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3631/5680 [9:14:40<4:32:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4255', 'grad_norm': '0.3291', 'learning_rate': '5.768e-05', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 29745152, 'tokens/trainable': 29418120, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3631/5680 [9:14:40<4:32:53,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3632/5680 [9:14:48<4:32:48,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5954', 'grad_norm': '0.3921', 'learning_rate': '5.763e-05', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 29753344, 'tokens/trainable': 29426246, 'epoch': '5.058'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3632/5680 [9:14:48<4:32:48,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3633/5680 [9:14:56<4:32:23,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4263', 'grad_norm': '0.3325', 'learning_rate': '5.758e-05', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29761536, 'tokens/trainable': 29434392, 'epoch': '5.059'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3633/5680 [9:14:56<4:32:23,  7.98s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3634/5680 [9:15:04<4:32:30,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3344', 'grad_norm': '0.339', 'learning_rate': '5.753e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29769728, 'tokens/trainable': 29442520, 'epoch': '5.059'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3634/5680 [9:15:04<4:32:30,  7.99s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3635/5680 [9:15:12<4:32:57,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3919', 'grad_norm': '0.397', 'learning_rate': '5.748e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29777920, 'tokens/trainable': 29450688, 'epoch': '5.059'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3635/5680 [9:15:12<4:32:57,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3636/5680 [9:15:20<4:32:40,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.843', 'grad_norm': '0.4422', 'learning_rate': '5.743e-05', 'ppl': '2.323', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29786112, 'tokens/trainable': 29458824, 'epoch': '5.059'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3636/5680 [9:15:20<4:32:40,  8.00s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3637/5680 [9:15:28<4:32:44,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6214', 'grad_norm': '0.4525', 'learning_rate': '5.738e-05', 'ppl': '1.862', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29794304, 'tokens/trainable': 29466988, 'epoch': '5.059'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3637/5680 [9:15:28<4:32:44,  8.01s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3638/5680 [9:15:36<4:32:28,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7614', 'grad_norm': '0.3755', 'learning_rate': '5.733e-05', 'ppl': '2.141', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29802496, 'tokens/trainable': 29475116, 'epoch': '5.06'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3638/5680 [9:15:36<4:32:28,  8.01s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3639/5680 [9:15:44<4:31:43,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4591', 'grad_norm': '0.3741', 'learning_rate': '5.728e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29810688, 'tokens/trainable': 29483256, 'epoch': '5.06'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3639/5680 [9:15:44<4:31:43,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3640/5680 [9:15:52<4:31:34,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5702', 'grad_norm': '0.4239', 'learning_rate': '5.723e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29818880, 'tokens/trainable': 29491442, 'epoch': '5.06'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3640/5680 [9:15:52<4:31:34,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3641/5680 [9:16:00<4:31:19,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4907', 'grad_norm': '0.3745', 'learning_rate': '5.718e-05', 'ppl': '1.634', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29827072, 'tokens/trainable': 29499584, 'epoch': '5.06'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3641/5680 [9:16:00<4:31:19,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3642/5680 [9:16:08<4:30:51,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6124', 'grad_norm': '0.4258', 'learning_rate': '5.713e-05', 'ppl': '1.845', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29835264, 'tokens/trainable': 29507668, 'epoch': '5.06'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3642/5680 [9:16:08<4:30:51,  7.97s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3643/5680 [9:16:16<4:31:08,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4856', 'grad_norm': '0.5492', 'learning_rate': '5.708e-05', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29843456, 'tokens/trainable': 29515828, 'epoch': '5.06'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3643/5680 [9:16:16<4:31:08,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3644/5680 [9:16:24<4:31:00,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3168', 'grad_norm': '0.3838', 'learning_rate': '5.703e-05', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29851648, 'tokens/trainable': 29524000, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3644/5680 [9:16:24<4:31:00,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3645/5680 [9:16:32<4:31:02,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5758', 'grad_norm': '0.4139', 'learning_rate': '5.698e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 29859840, 'tokens/trainable': 29532106, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3645/5680 [9:16:32<4:31:02,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3646/5680 [9:16:40<4:30:23,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7346', 'grad_norm': '0.4489', 'learning_rate': '5.693e-05', 'ppl': '2.085', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29868032, 'tokens/trainable': 29540208, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3646/5680 [9:16:40<4:30:23,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3647/5680 [9:16:48<4:30:16,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4664', 'grad_norm': '0.451', 'learning_rate': '5.688e-05', 'ppl': '1.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 29876224, 'tokens/trainable': 29548372, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3647/5680 [9:16:48<4:30:16,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3648/5680 [9:16:56<4:30:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8312', 'grad_norm': '0.4419', 'learning_rate': '5.683e-05', 'ppl': '2.296', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 29884416, 'tokens/trainable': 29556520, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3648/5680 [9:16:56<4:30:09,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3649/5680 [9:17:04<4:29:48,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3339', 'grad_norm': '0.353', 'learning_rate': '5.678e-05', 'ppl': '1.396', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29892608, 'tokens/trainable': 29564654, 'epoch': '5.061'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3649/5680 [9:17:04<4:29:48,  7.97s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3650/5680 [9:17:12<4:30:01,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3883', 'grad_norm': '0.3775', 'learning_rate': '5.673e-05', 'ppl': '1.474', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29900800, 'tokens/trainable': 29572802, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3650/5680 [9:17:12<4:30:01,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3651/5680 [9:17:20<4:29:51,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4763', 'grad_norm': '0.3966', 'learning_rate': '5.668e-05', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 29908992, 'tokens/trainable': 29580960, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3651/5680 [9:17:20<4:29:51,  7.98s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3652/5680 [9:17:28<4:32:31,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4133', 'grad_norm': '0.4354', 'learning_rate': '5.663e-05', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.9', 'tokens/total': 29917184, 'tokens/trainable': 29589148, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3652/5680 [9:17:28<4:32:31,  8.06s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3653/5680 [9:17:36<4:31:20,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.8012', 'grad_norm': '0.4014', 'learning_rate': '5.658e-05', 'ppl': '2.228', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 29925376, 'tokens/trainable': 29597226, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3653/5680 [9:17:36<4:31:20,  8.03s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3654/5680 [9:17:44<4:30:19,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4182', 'grad_norm': '0.3922', 'learning_rate': '5.653e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 29933568, 'tokens/trainable': 29605396, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3654/5680 [9:17:44<4:30:19,  8.01s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3655/5680 [9:17:52<4:30:18,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.355', 'grad_norm': '0.367', 'learning_rate': '5.648e-05', 'ppl': '1.426', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 29941760, 'tokens/trainable': 29613504, 'epoch': '5.062'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3655/5680 [9:17:52<4:30:18,  8.01s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3656/5680 [9:18:00<4:30:04,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6792', 'grad_norm': '0.3791', 'learning_rate': '5.643e-05', 'ppl': '1.972', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 29949952, 'tokens/trainable': 29621644, 'epoch': '5.063'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3656/5680 [9:18:00<4:30:04,  8.01s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3657/5680 [9:18:08<4:29:28,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3929', 'grad_norm': '0.321', 'learning_rate': '5.638e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 29958144, 'tokens/trainable': 29629716, 'epoch': '5.063'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3657/5680 [9:18:08<4:29:28,  7.99s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3658/5680 [9:18:17<4:32:28,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.3941', 'grad_norm': '0.3842', 'learning_rate': '5.633e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.5', 'tokens/total': 29966336, 'tokens/trainable': 29637896, 'epoch': '5.063'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3658/5680 [9:18:17<4:32:28,  8.09s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3659/5680 [9:18:25<4:31:47,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.5525', 'grad_norm': '0.3661', 'learning_rate': '5.628e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29974528, 'tokens/trainable': 29646052, 'epoch': '5.063'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3659/5680 [9:18:25<4:31:47,  8.07s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3660/5680 [9:18:33<4:30:47,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.3738', 'grad_norm': '0.352', 'learning_rate': '5.623e-05', 'ppl': '1.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 29982720, 'tokens/trainable': 29654208, 'epoch': '5.063'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3660/5680 [9:18:33<4:30:47,  8.04s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3661/5680 [9:18:41<4:30:34,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.3271', 'grad_norm': '0.3267', 'learning_rate': '5.618e-05', 'ppl': '1.387', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 29990912, 'tokens/trainable': 29662372, 'epoch': '5.064'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3661/5680 [9:18:41<4:30:34,  8.04s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3662/5680 [9:18:49<4:29:47,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.7314', 'grad_norm': '0.414', 'learning_rate': '5.614e-05', 'ppl': '2.078', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 29999104, 'tokens/trainable': 29670548, 'epoch': '5.064'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3662/5680 [9:18:49<4:29:47,  8.02s/it] 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3663/5680 [9:18:57<4:29:27,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5806', 'grad_norm': '0.344', 'learning_rate': '5.609e-05', 'ppl': '1.787', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30007296, 'tokens/trainable': 29678724, 'epoch': '5.064'}
 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3663/5680 [9:18:57<4:29:27,  8.02s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3664/5680 [9:19:05<4:28:48,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5045', 'grad_norm': '0.3724', 'learning_rate': '5.604e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30015488, 'tokens/trainable': 29686878, 'epoch': '5.064'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 3664/5680 [9:19:05<4:28:48,  8.00s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3665/5680 [9:19:13<4:28:57,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3801', 'grad_norm': '0.3576', 'learning_rate': '5.599e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 30023680, 'tokens/trainable': 29694960, 'epoch': '5.064'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3665/5680 [9:19:13<4:28:57,  8.01s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3666/5680 [9:19:21<4:28:08,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4052', 'grad_norm': '0.3551', 'learning_rate': '5.594e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 30031872, 'tokens/trainable': 29703112, 'epoch': '5.064'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3666/5680 [9:19:21<4:28:08,  7.99s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3667/5680 [9:19:29<4:28:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.456', 'grad_norm': '0.3574', 'learning_rate': '5.589e-05', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30040064, 'tokens/trainable': 29711274, 'epoch': '5.065'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3667/5680 [9:19:29<4:28:13,  7.99s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3668/5680 [9:19:37<4:28:17,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.8271', 'grad_norm': '0.4614', 'learning_rate': '5.584e-05', 'ppl': '2.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 30048256, 'tokens/trainable': 29719374, 'epoch': '5.065'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3668/5680 [9:19:37<4:28:17,  8.00s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3669/5680 [9:19:45<4:28:05,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6973', 'grad_norm': '0.4496', 'learning_rate': '5.579e-05', 'ppl': '2.008', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30056448, 'tokens/trainable': 29727540, 'epoch': '5.065'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3669/5680 [9:19:45<4:28:05,  8.00s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3670/5680 [9:19:52<4:27:42,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4867', 'grad_norm': '0.4004', 'learning_rate': '5.574e-05', 'ppl': '1.627', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30064640, 'tokens/trainable': 29735704, 'epoch': '5.065'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3670/5680 [9:19:53<4:27:42,  7.99s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3671/5680 [9:20:00<4:27:37,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4863', 'grad_norm': '0.377', 'learning_rate': '5.569e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30072832, 'tokens/trainable': 29743832, 'epoch': '5.065'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3671/5680 [9:20:00<4:27:37,  7.99s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3672/5680 [9:20:08<4:27:02,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5504', 'grad_norm': '0.3619', 'learning_rate': '5.564e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30081024, 'tokens/trainable': 29751936, 'epoch': '5.065'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 3672/5680 [9:20:08<4:27:02,  7.98s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3673/5680 [9:20:16<4:27:05,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3592', 'grad_norm': '0.403', 'learning_rate': '5.559e-05', 'ppl': '1.432', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30089216, 'tokens/trainable': 29760068, 'epoch': '5.066'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3673/5680 [9:20:16<4:27:05,  7.99s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3674/5680 [9:20:24<4:26:42,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5268', 'grad_norm': '0.3817', 'learning_rate': '5.554e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30097408, 'tokens/trainable': 29768208, 'epoch': '5.066'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3674/5680 [9:20:24<4:26:42,  7.98s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3675/5680 [9:20:32<4:26:18,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4716', 'grad_norm': '0.4166', 'learning_rate': '5.549e-05', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30105600, 'tokens/trainable': 29776320, 'epoch': '5.066'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 3675/5680 [9:20:32<4:26:18,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3676/5680 [9:20:40<4:26:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6429', 'grad_norm': '0.4114', 'learning_rate': '5.544e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30113792, 'tokens/trainable': 29784486, 'epoch': '5.066'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3676/5680 [9:20:40<4:26:12,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3677/5680 [9:20:48<4:26:35,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5076', 'grad_norm': '0.4959', 'learning_rate': '5.539e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30121984, 'tokens/trainable': 29792670, 'epoch': '5.066'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3677/5680 [9:20:48<4:26:35,  7.99s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3678/5680 [9:20:56<4:26:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6686', 'grad_norm': '0.3947', 'learning_rate': '5.534e-05', 'ppl': '1.952', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30130176, 'tokens/trainable': 29800836, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3678/5680 [9:20:56<4:26:09,  7.98s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3679/5680 [9:21:04<4:26:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5717', 'grad_norm': '0.4885', 'learning_rate': '5.529e-05', 'ppl': '1.771', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30138368, 'tokens/trainable': 29808944, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 3679/5680 [9:21:04<4:26:09,  7.98s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3680/5680 [9:21:12<4:25:58,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.398', 'grad_norm': '0.3603', 'learning_rate': '5.524e-05', 'ppl': '1.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30146560, 'tokens/trainable': 29817108, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3680/5680 [9:21:12<4:25:58,  7.98s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3681/5680 [9:21:20<4:25:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3831', 'grad_norm': '0.374', 'learning_rate': '5.519e-05', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30154752, 'tokens/trainable': 29825220, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3681/5680 [9:21:20<4:25:30,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3682/5680 [9:21:28<4:25:13,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4778', 'grad_norm': '0.3402', 'learning_rate': '5.514e-05', 'ppl': '1.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30162944, 'tokens/trainable': 29833348, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3682/5680 [9:21:28<4:25:13,  7.96s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3683/5680 [9:21:36<4:25:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5085', 'grad_norm': '0.4116', 'learning_rate': '5.509e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30171136, 'tokens/trainable': 29841486, 'epoch': '5.067'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 3683/5680 [9:21:36<4:25:12,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3684/5680 [9:21:44<4:24:55,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5145', 'grad_norm': '0.4161', 'learning_rate': '5.505e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30179328, 'tokens/trainable': 29849552, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3684/5680 [9:21:44<4:24:55,  7.96s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3685/5680 [9:21:52<4:25:02,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.2661', 'grad_norm': '0.4063', 'learning_rate': '5.5e-05', 'ppl': '1.305', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30187520, 'tokens/trainable': 29857708, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3685/5680 [9:21:52<4:25:02,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3686/5680 [9:22:00<4:24:49,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6492', 'grad_norm': '0.4323', 'learning_rate': '5.495e-05', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30195712, 'tokens/trainable': 29865864, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3686/5680 [9:22:00<4:24:49,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3687/5680 [9:22:08<4:24:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5054', 'grad_norm': '0.3992', 'learning_rate': '5.49e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30203904, 'tokens/trainable': 29874024, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3687/5680 [9:22:08<4:24:45,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3688/5680 [9:22:16<4:24:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4254', 'grad_norm': '0.3651', 'learning_rate': '5.485e-05', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30212096, 'tokens/trainable': 29882156, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3688/5680 [9:22:16<4:24:30,  7.97s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3689/5680 [9:22:24<4:24:10,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6601', 'grad_norm': '0.3747', 'learning_rate': '5.48e-05', 'ppl': '1.935', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30220288, 'tokens/trainable': 29890268, 'epoch': '5.068'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3689/5680 [9:22:24<4:24:10,  7.96s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3690/5680 [9:22:32<4:24:54,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5202', 'grad_norm': '0.4702', 'learning_rate': '5.475e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 30228480, 'tokens/trainable': 29898358, 'epoch': '5.069'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3690/5680 [9:22:32<4:24:54,  7.99s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3691/5680 [9:22:40<4:25:53,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.48', 'grad_norm': '0.4069', 'learning_rate': '5.47e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 30236672, 'tokens/trainable': 29906534, 'epoch': '5.069'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3691/5680 [9:22:40<4:25:53,  8.02s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3692/5680 [9:22:48<4:25:21,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7699', 'grad_norm': '0.4326', 'learning_rate': '5.465e-05', 'ppl': '2.16', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30244864, 'tokens/trainable': 29914716, 'epoch': '5.069'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3692/5680 [9:22:48<4:25:21,  8.01s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3693/5680 [9:22:56<4:24:52,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4645', 'grad_norm': '0.375', 'learning_rate': '5.46e-05', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 30253056, 'tokens/trainable': 29922900, 'epoch': '5.069'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3693/5680 [9:22:56<4:24:52,  8.00s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3694/5680 [9:23:04<4:24:41,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4234', 'grad_norm': '0.3484', 'learning_rate': '5.455e-05', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30261248, 'tokens/trainable': 29931082, 'epoch': '5.069'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 3694/5680 [9:23:04<4:24:41,  8.00s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3695/5680 [9:23:12<4:27:29,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5621', 'grad_norm': '0.4256', 'learning_rate': '5.45e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.7', 'tokens/total': 30269440, 'tokens/trainable': 29939196, 'epoch': '5.07'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3695/5680 [9:23:12<4:27:29,  8.09s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3696/5680 [9:23:20<4:26:17,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.352', 'grad_norm': '0.3491', 'learning_rate': '5.445e-05', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30277632, 'tokens/trainable': 29947362, 'epoch': '5.07'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3696/5680 [9:23:20<4:26:17,  8.05s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3697/5680 [9:23:28<4:25:35,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.4257', 'grad_norm': '0.4039', 'learning_rate': '5.44e-05', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 30285824, 'tokens/trainable': 29955440, 'epoch': '5.07'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 3697/5680 [9:23:28<4:25:35,  8.04s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3698/5680 [9:23:36<4:24:27,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.2731', 'grad_norm': '0.3483', 'learning_rate': '5.435e-05', 'ppl': '1.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 30294016, 'tokens/trainable': 29963604, 'epoch': '5.07'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3698/5680 [9:23:36<4:24:27,  8.01s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3699/5680 [9:23:44<4:24:26,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3578', 'grad_norm': '0.3114', 'learning_rate': '5.431e-05', 'ppl': '1.43', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 30302208, 'tokens/trainable': 29971688, 'epoch': '5.07'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3699/5680 [9:23:44<4:24:26,  8.01s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3700/5680 [9:23:52<4:24:47,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4004', 'grad_norm': '0.373', 'learning_rate': '5.426e-05', 'ppl': '1.492', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30310400, 'tokens/trainable': 29979864, 'epoch': '5.07'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3700/5680 [9:23:52<4:24:47,  8.02s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3701/5680 [9:24:00<4:23:58,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.4224', 'learning_rate': '5.421e-05', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 30318592, 'tokens/trainable': 29988032, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 3701/5680 [9:24:00<4:23:58,  8.00s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3702/5680 [9:24:08<4:23:46,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4731', 'grad_norm': '0.3553', 'learning_rate': '5.416e-05', 'ppl': '1.605', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 30326784, 'tokens/trainable': 29996108, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3702/5680 [9:24:08<4:23:46,  8.00s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3703/5680 [9:24:16<4:23:07,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3445', 'grad_norm': '0.343', 'learning_rate': '5.411e-05', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 30334976, 'tokens/trainable': 30004284, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3703/5680 [9:24:16<4:23:07,  7.99s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3704/5680 [9:24:24<4:22:51,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.324', 'grad_norm': '0.3474', 'learning_rate': '5.406e-05', 'ppl': '1.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30343168, 'tokens/trainable': 30012404, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3704/5680 [9:24:24<4:22:51,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3705/5680 [9:24:32<4:22:40,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5885', 'grad_norm': '0.4656', 'learning_rate': '5.401e-05', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30351360, 'tokens/trainable': 30020548, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 3705/5680 [9:24:32<4:22:40,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3706/5680 [9:24:40<4:22:40,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.666', 'grad_norm': '0.4153', 'learning_rate': '5.396e-05', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30359552, 'tokens/trainable': 30028692, 'epoch': '5.071'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3706/5680 [9:24:40<4:22:40,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3707/5680 [9:24:48<4:22:30,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7119', 'grad_norm': '0.4384', 'learning_rate': '5.391e-05', 'ppl': '2.038', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30367744, 'tokens/trainable': 30036876, 'epoch': '5.072'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3707/5680 [9:24:48<4:22:30,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3708/5680 [9:24:56<4:22:33,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3766', 'grad_norm': '0.4236', 'learning_rate': '5.386e-05', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30375936, 'tokens/trainable': 30044990, 'epoch': '5.072'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3708/5680 [9:24:56<4:22:33,  7.99s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3709/5680 [9:25:04<4:22:35,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5197', 'grad_norm': '0.4234', 'learning_rate': '5.381e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30384128, 'tokens/trainable': 30053104, 'epoch': '5.072'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 3709/5680 [9:25:04<4:22:35,  7.99s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3710/5680 [9:25:12<4:21:59,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3536', 'grad_norm': '0.3874', 'learning_rate': '5.377e-05', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 30392320, 'tokens/trainable': 30061178, 'epoch': '5.072'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3710/5680 [9:25:12<4:21:59,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3711/5680 [9:25:20<4:22:24,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3723', 'grad_norm': '0.3786', 'learning_rate': '5.372e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30400512, 'tokens/trainable': 30069334, 'epoch': '5.072'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3711/5680 [9:25:20<4:22:24,  8.00s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3712/5680 [9:25:28<4:21:36,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.653', 'grad_norm': '0.4377', 'learning_rate': '5.367e-05', 'ppl': '1.921', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30408704, 'tokens/trainable': 30077446, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 3712/5680 [9:25:28<4:21:36,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3713/5680 [9:25:36<4:21:39,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5078', 'grad_norm': '0.3819', 'learning_rate': '5.362e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 30416896, 'tokens/trainable': 30085522, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3713/5680 [9:25:36<4:21:39,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3714/5680 [9:25:44<4:21:23,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4841', 'grad_norm': '0.3766', 'learning_rate': '5.357e-05', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30425088, 'tokens/trainable': 30093696, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3714/5680 [9:25:44<4:21:23,  7.98s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3715/5680 [9:25:52<4:21:00,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.547', 'grad_norm': '0.3478', 'learning_rate': '5.352e-05', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30433280, 'tokens/trainable': 30101802, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3715/5680 [9:25:52<4:21:00,  7.97s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3716/5680 [9:26:00<4:20:56,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3123', 'grad_norm': '0.3646', 'learning_rate': '5.347e-05', 'ppl': '1.367', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30441472, 'tokens/trainable': 30109938, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3716/5680 [9:26:00<4:20:56,  7.97s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3717/5680 [9:26:08<4:20:28,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4378', 'grad_norm': '0.4225', 'learning_rate': '5.342e-05', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 30449664, 'tokens/trainable': 30118114, 'epoch': '5.073'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3717/5680 [9:26:08<4:20:28,  7.96s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3718/5680 [9:26:16<4:20:46,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3093', 'grad_norm': '0.3365', 'learning_rate': '5.337e-05', 'ppl': '1.362', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30457856, 'tokens/trainable': 30126300, 'epoch': '5.074'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3718/5680 [9:26:16<4:20:46,  7.97s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3719/5680 [9:26:24<4:20:37,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5111', 'grad_norm': '0.4482', 'learning_rate': '5.332e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30466048, 'tokens/trainable': 30134456, 'epoch': '5.074'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3719/5680 [9:26:24<4:20:37,  7.97s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3720/5680 [9:26:32<4:20:19,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5694', 'grad_norm': '0.3997', 'learning_rate': '5.328e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 30474240, 'tokens/trainable': 30142640, 'epoch': '5.074'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 3720/5680 [9:26:32<4:20:19,  7.97s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3721/5680 [9:26:40<4:20:40,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5042', 'grad_norm': '0.3857', 'learning_rate': '5.323e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30482432, 'tokens/trainable': 30150780, 'epoch': '5.074'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3721/5680 [9:26:40<4:20:40,  7.98s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3722/5680 [9:26:48<4:20:40,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6272', 'grad_norm': '0.3812', 'learning_rate': '5.318e-05', 'ppl': '1.872', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 30490624, 'tokens/trainable': 30158856, 'epoch': '5.074'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3722/5680 [9:26:48<4:20:40,  7.99s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3723/5680 [9:26:56<4:20:17,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4806', 'grad_norm': '0.3947', 'learning_rate': '5.313e-05', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30498816, 'tokens/trainable': 30166972, 'epoch': '5.074'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 3723/5680 [9:26:56<4:20:17,  7.98s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3724/5680 [9:27:04<4:19:31,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5855', 'grad_norm': '0.3921', 'learning_rate': '5.308e-05', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 30507008, 'tokens/trainable': 30175154, 'epoch': '5.075'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3724/5680 [9:27:04<4:19:31,  7.96s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3725/5680 [9:27:12<4:19:12,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4141', 'grad_norm': '0.3736', 'learning_rate': '5.303e-05', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 30515200, 'tokens/trainable': 30183308, 'epoch': '5.075'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3725/5680 [9:27:12<4:19:12,  7.96s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3726/5680 [9:27:20<4:19:22,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6071', 'grad_norm': '0.4011', 'learning_rate': '5.298e-05', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30523392, 'tokens/trainable': 30191420, 'epoch': '5.075'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3726/5680 [9:27:20<4:19:22,  7.96s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3727/5680 [9:27:28<4:19:03,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4291', 'grad_norm': '0.3783', 'learning_rate': '5.293e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30531584, 'tokens/trainable': 30199548, 'epoch': '5.075'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 3727/5680 [9:27:28<4:19:03,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3728/5680 [9:27:36<4:19:03,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5809', 'grad_norm': '0.3559', 'learning_rate': '5.288e-05', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30539776, 'tokens/trainable': 30207724, 'epoch': '5.075'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3728/5680 [9:27:36<4:19:03,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3729/5680 [9:27:44<4:19:01,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5174', 'grad_norm': '0.4253', 'learning_rate': '5.284e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 30547968, 'tokens/trainable': 30215856, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3729/5680 [9:27:44<4:19:01,  7.97s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3730/5680 [9:27:51<4:18:45,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5048', 'grad_norm': '0.4562', 'learning_rate': '5.279e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30556160, 'tokens/trainable': 30224012, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3730/5680 [9:27:51<4:18:45,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3731/5680 [9:27:59<4:18:55,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4333', 'grad_norm': '0.3626', 'learning_rate': '5.274e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 30564352, 'tokens/trainable': 30232200, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3731/5680 [9:27:59<4:18:55,  7.97s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3732/5680 [9:28:07<4:18:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3723', 'grad_norm': '0.4241', 'learning_rate': '5.269e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30572544, 'tokens/trainable': 30240300, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3732/5680 [9:28:07<4:18:45,  7.97s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3733/5680 [9:28:15<4:18:26,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6205', 'grad_norm': '0.4058', 'learning_rate': '5.264e-05', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30580736, 'tokens/trainable': 30248432, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3733/5680 [9:28:15<4:18:26,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3734/5680 [9:28:23<4:17:57,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5547', 'grad_norm': '0.4602', 'learning_rate': '5.259e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 30588928, 'tokens/trainable': 30256594, 'epoch': '5.076'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 3734/5680 [9:28:23<4:17:57,  7.95s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3735/5680 [9:28:31<4:18:11,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3923', 'grad_norm': '0.3088', 'learning_rate': '5.254e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30597120, 'tokens/trainable': 30264754, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3735/5680 [9:28:31<4:18:11,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3736/5680 [9:28:39<4:18:35,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4099', 'grad_norm': '0.4462', 'learning_rate': '5.249e-05', 'ppl': '1.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 30605312, 'tokens/trainable': 30272900, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3736/5680 [9:28:39<4:18:35,  7.98s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3737/5680 [9:28:47<4:17:53,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5848', 'grad_norm': '0.4266', 'learning_rate': '5.245e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 30613504, 'tokens/trainable': 30281092, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3737/5680 [9:28:47<4:17:53,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3738/5680 [9:28:56<4:20:55,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.282', 'grad_norm': '0.3994', 'learning_rate': '5.24e-05', 'ppl': '1.326', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.1', 'tokens/total': 30621696, 'tokens/trainable': 30289172, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 3738/5680 [9:28:56<4:20:55,  8.06s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3739/5680 [9:29:04<4:19:59,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.5634', 'grad_norm': '0.4127', 'learning_rate': '5.235e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30629888, 'tokens/trainable': 30297332, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3739/5680 [9:29:04<4:19:59,  8.04s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3740/5680 [9:29:11<4:19:18,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4918', 'grad_norm': '0.4565', 'learning_rate': '5.23e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30638080, 'tokens/trainable': 30305488, 'epoch': '5.077'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3740/5680 [9:29:11<4:19:18,  8.02s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3741/5680 [9:29:20<4:19:04,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.666', 'grad_norm': '0.4777', 'learning_rate': '5.225e-05', 'ppl': '1.946', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 30646272, 'tokens/trainable': 30313644, 'epoch': '5.078'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3741/5680 [9:29:20<4:19:04,  8.02s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3742/5680 [9:29:27<4:18:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5988', 'grad_norm': '0.409', 'learning_rate': '5.22e-05', 'ppl': '1.82', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 30654464, 'tokens/trainable': 30321832, 'epoch': '5.078'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3742/5680 [9:29:27<4:18:13,  7.99s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3743/5680 [9:29:35<4:17:52,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4179', 'grad_norm': '0.394', 'learning_rate': '5.215e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30662656, 'tokens/trainable': 30329988, 'epoch': '5.078'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3743/5680 [9:29:35<4:17:52,  7.99s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3744/5680 [9:29:43<4:18:01,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3955', 'grad_norm': '0.3562', 'learning_rate': '5.211e-05', 'ppl': '1.485', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30670848, 'tokens/trainable': 30338158, 'epoch': '5.078'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3744/5680 [9:29:43<4:18:01,  8.00s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3745/5680 [9:29:51<4:18:01,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6108', 'grad_norm': '0.3948', 'learning_rate': '5.206e-05', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30679040, 'tokens/trainable': 30346288, 'epoch': '5.078'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 3745/5680 [9:29:51<4:18:01,  8.00s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3746/5680 [9:29:59<4:17:44,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4574', 'grad_norm': '0.322', 'learning_rate': '5.201e-05', 'ppl': '1.58', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30687232, 'tokens/trainable': 30354440, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3746/5680 [9:29:59<4:17:44,  8.00s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3747/5680 [9:30:07<4:17:22,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4923', 'grad_norm': '0.3966', 'learning_rate': '5.196e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30695424, 'tokens/trainable': 30362548, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3747/5680 [9:30:07<4:17:22,  7.99s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3748/5680 [9:30:15<4:17:09,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3904', 'grad_norm': '0.3937', 'learning_rate': '5.191e-05', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30703616, 'tokens/trainable': 30370690, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3748/5680 [9:30:15<4:17:09,  7.99s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3749/5680 [9:30:23<4:16:32,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6404', 'grad_norm': '0.4301', 'learning_rate': '5.186e-05', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 30711808, 'tokens/trainable': 30378860, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 3749/5680 [9:30:23<4:16:32,  7.97s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3750/5680 [9:30:31<4:16:37,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3049', 'grad_norm': '0.3144', 'learning_rate': '5.181e-05', 'ppl': '1.357', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30720000, 'tokens/trainable': 30386964, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3750/5680 [9:30:31<4:16:37,  7.98s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3751/5680 [9:30:39<4:16:32,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5409', 'grad_norm': '0.3602', 'learning_rate': '5.177e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 30728192, 'tokens/trainable': 30395144, 'epoch': '5.079'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3751/5680 [9:30:39<4:16:32,  7.98s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3752/5680 [9:30:47<4:15:53,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4797', 'grad_norm': '0.4019', 'learning_rate': '5.172e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 30736384, 'tokens/trainable': 30403304, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3752/5680 [9:30:47<4:15:53,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3753/5680 [9:30:55<4:16:16,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3911', 'grad_norm': '0.3936', 'learning_rate': '5.167e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30744576, 'tokens/trainable': 30411484, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 3753/5680 [9:30:55<4:16:16,  7.98s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3754/5680 [9:31:03<4:15:52,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5019', 'grad_norm': '0.4159', 'learning_rate': '5.162e-05', 'ppl': '1.652', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30752768, 'tokens/trainable': 30419568, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3754/5680 [9:31:03<4:15:52,  7.97s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3755/5680 [9:31:11<4:15:25,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5153', 'grad_norm': '0.4338', 'learning_rate': '5.157e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 30760960, 'tokens/trainable': 30427700, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3755/5680 [9:31:11<4:15:25,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3756/5680 [9:31:19<4:15:23,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.393', 'grad_norm': '0.4031', 'learning_rate': '5.152e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30769152, 'tokens/trainable': 30435816, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3756/5680 [9:31:19<4:15:23,  7.96s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3757/5680 [9:31:27<4:15:30,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5167', 'grad_norm': '0.4147', 'learning_rate': '5.148e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30777344, 'tokens/trainable': 30443958, 'epoch': '5.08'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                 | 3757/5680 [9:31:27<4:15:30,  7.97s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3758/5680 [9:31:35<4:15:37,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5331', 'grad_norm': '0.4408', 'learning_rate': '5.143e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30785536, 'tokens/trainable': 30452128, 'epoch': '5.081'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3758/5680 [9:31:35<4:15:37,  7.98s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3759/5680 [9:31:43<4:15:10,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4279', 'grad_norm': '0.4889', 'learning_rate': '5.138e-05', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 30793728, 'tokens/trainable': 30460214, 'epoch': '5.081'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3759/5680 [9:31:43<4:15:10,  7.97s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3760/5680 [9:31:51<4:15:00,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4261', 'grad_norm': '0.4619', 'learning_rate': '5.133e-05', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30801920, 'tokens/trainable': 30468344, 'epoch': '5.081'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 3760/5680 [9:31:51<4:15:00,  7.97s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3761/5680 [9:31:59<4:15:04,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3497', 'grad_norm': '0.3261', 'learning_rate': '5.128e-05', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30810112, 'tokens/trainable': 30476442, 'epoch': '5.081'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3761/5680 [9:31:59<4:15:04,  7.98s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3762/5680 [9:32:07<4:14:47,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5455', 'grad_norm': '0.3794', 'learning_rate': '5.123e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 30818304, 'tokens/trainable': 30484604, 'epoch': '5.081'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3762/5680 [9:32:07<4:14:47,  7.97s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3763/5680 [9:32:15<4:15:03,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5801', 'grad_norm': '0.3912', 'learning_rate': '5.119e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30826496, 'tokens/trainable': 30492770, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3763/5680 [9:32:15<4:15:03,  7.98s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3764/5680 [9:32:23<4:14:59,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4736', 'grad_norm': '0.3389', 'learning_rate': '5.114e-05', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30834688, 'tokens/trainable': 30500912, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 3764/5680 [9:32:23<4:14:59,  7.98s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3765/5680 [9:32:31<4:15:32,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.3312', 'learning_rate': '5.109e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 30842880, 'tokens/trainable': 30509080, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3765/5680 [9:32:31<4:15:32,  8.01s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3766/5680 [9:32:39<4:15:20,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3681', 'grad_norm': '0.367', 'learning_rate': '5.104e-05', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 30851072, 'tokens/trainable': 30517248, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3766/5680 [9:32:39<4:15:20,  8.00s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3767/5680 [9:32:47<4:15:23,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6747', 'grad_norm': '0.4089', 'learning_rate': '5.099e-05', 'ppl': '1.963', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30859264, 'tokens/trainable': 30525386, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3767/5680 [9:32:47<4:15:23,  8.01s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3768/5680 [9:32:55<4:15:12,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4084', 'grad_norm': '0.3701', 'learning_rate': '5.094e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30867456, 'tokens/trainable': 30533528, 'epoch': '5.082'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 3768/5680 [9:32:55<4:15:12,  8.01s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3769/5680 [9:33:03<4:15:06,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5274', 'grad_norm': '0.4038', 'learning_rate': '5.09e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 30875648, 'tokens/trainable': 30541668, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3769/5680 [9:33:03<4:15:06,  8.01s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3770/5680 [9:33:11<4:15:16,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4202', 'grad_norm': '0.3318', 'learning_rate': '5.085e-05', 'ppl': '1.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 30883840, 'tokens/trainable': 30549842, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3770/5680 [9:33:11<4:15:16,  8.02s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3771/5680 [9:33:19<4:14:55,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7018', 'grad_norm': '0.3973', 'learning_rate': '5.08e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30892032, 'tokens/trainable': 30557988, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 3771/5680 [9:33:19<4:14:55,  8.01s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3772/5680 [9:33:27<4:14:26,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5078', 'grad_norm': '0.9141', 'learning_rate': '5.075e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 30900224, 'tokens/trainable': 30566160, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3772/5680 [9:33:27<4:14:26,  8.00s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3773/5680 [9:33:35<4:14:00,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5311', 'grad_norm': '0.3618', 'learning_rate': '5.07e-05', 'ppl': '1.701', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 30908416, 'tokens/trainable': 30574308, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3773/5680 [9:33:35<4:14:00,  7.99s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3774/5680 [9:33:43<4:14:17,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3444', 'grad_norm': '0.377', 'learning_rate': '5.066e-05', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 30916608, 'tokens/trainable': 30582440, 'epoch': '5.083'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3774/5680 [9:33:43<4:14:17,  8.00s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3775/5680 [9:33:51<4:14:06,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4728', 'grad_norm': '0.3727', 'learning_rate': '5.061e-05', 'ppl': '1.604', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 30924800, 'tokens/trainable': 30590588, 'epoch': '5.084'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                | 3775/5680 [9:33:51<4:14:06,  8.00s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3776/5680 [9:33:59<4:16:38,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.4452', 'grad_norm': '0.3597', 'learning_rate': '5.056e-05', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976.6', 'tokens/total': 30932992, 'tokens/trainable': 30598676, 'epoch': '5.084'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3776/5680 [9:33:59<4:16:38,  8.09s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3777/5680 [9:34:07<4:15:23,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5323', 'grad_norm': '0.4962', 'learning_rate': '5.051e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30941184, 'tokens/trainable': 30606832, 'epoch': '5.084'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3777/5680 [9:34:07<4:15:23,  8.05s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3778/5680 [9:34:15<4:14:35,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5207', 'grad_norm': '0.5174', 'learning_rate': '5.046e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 30949376, 'tokens/trainable': 30614988, 'epoch': '5.084'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3778/5680 [9:34:15<4:14:35,  8.03s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3779/5680 [9:34:23<4:14:55,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.4997', 'grad_norm': '0.418', 'learning_rate': '5.042e-05', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 30957568, 'tokens/trainable': 30623080, 'epoch': '5.084'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 3779/5680 [9:34:23<4:14:55,  8.05s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3780/5680 [9:34:31<4:14:38,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.391', 'grad_norm': '0.3794', 'learning_rate': '5.037e-05', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 30965760, 'tokens/trainable': 30631196, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3780/5680 [9:34:31<4:14:38,  8.04s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3781/5680 [9:34:40<4:17:26,  8.13s/it]                                                                                                                                                                                                                                             {'loss': '0.3619', 'grad_norm': '0.4069', 'learning_rate': '5.032e-05', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976.5', 'tokens/total': 30973952, 'tokens/trainable': 30639348, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3781/5680 [9:34:40<4:17:26,  8.13s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3782/5680 [9:34:48<4:16:15,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.6201', 'grad_norm': '0.4702', 'learning_rate': '5.027e-05', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 30982144, 'tokens/trainable': 30647492, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 3782/5680 [9:34:48<4:16:15,  8.10s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3783/5680 [9:34:56<4:15:27,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.7265', 'grad_norm': '0.4607', 'learning_rate': '5.022e-05', 'ppl': '2.068', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 30990336, 'tokens/trainable': 30655618, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3783/5680 [9:34:56<4:15:27,  8.08s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3784/5680 [9:35:04<4:14:22,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.497', 'grad_norm': '0.5733', 'learning_rate': '5.018e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 30998528, 'tokens/trainable': 30663784, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3784/5680 [9:35:04<4:14:22,  8.05s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3785/5680 [9:35:12<4:14:01,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7247', 'grad_norm': '0.4572', 'learning_rate': '5.013e-05', 'ppl': '2.064', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 31006720, 'tokens/trainable': 30671858, 'epoch': '5.085'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3785/5680 [9:35:12<4:14:01,  8.04s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3786/5680 [9:35:20<4:13:22,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4465', 'grad_norm': '0.3404', 'learning_rate': '5.008e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 31014912, 'tokens/trainable': 30679972, 'epoch': '5.086'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 3786/5680 [9:35:20<4:13:22,  8.03s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3787/5680 [9:35:28<4:13:26,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4809', 'grad_norm': '0.3792', 'learning_rate': '5.003e-05', 'ppl': '1.618', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 31023104, 'tokens/trainable': 30688040, 'epoch': '5.086'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3787/5680 [9:35:28<4:13:26,  8.03s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3788/5680 [9:35:36<4:13:22,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.2992', 'grad_norm': '0.3759', 'learning_rate': '4.998e-05', 'ppl': '1.349', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 31031296, 'tokens/trainable': 30696088, 'epoch': '5.086'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3788/5680 [9:35:36<4:13:22,  8.04s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3789/5680 [9:35:44<4:13:16,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.603', 'grad_norm': '0.4018', 'learning_rate': '4.994e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31039488, 'tokens/trainable': 30704268, 'epoch': '5.086'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3789/5680 [9:35:44<4:13:16,  8.04s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3790/5680 [9:35:52<4:12:53,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4763', 'grad_norm': '0.3214', 'learning_rate': '4.989e-05', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31047680, 'tokens/trainable': 30712420, 'epoch': '5.086'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 3790/5680 [9:35:52<4:12:53,  8.03s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3791/5680 [9:36:00<4:12:40,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.9473', 'grad_norm': '0.4957', 'learning_rate': '4.984e-05', 'ppl': '2.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 31055872, 'tokens/trainable': 30720468, 'epoch': '5.086'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3791/5680 [9:36:00<4:12:40,  8.03s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3792/5680 [9:36:08<4:11:58,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7214', 'grad_norm': '0.4279', 'learning_rate': '4.979e-05', 'ppl': '2.057', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 31064064, 'tokens/trainable': 30728598, 'epoch': '5.087'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3792/5680 [9:36:08<4:11:58,  8.01s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3793/5680 [9:36:16<4:11:47,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4025', 'grad_norm': '0.3786', 'learning_rate': '4.974e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31072256, 'tokens/trainable': 30736714, 'epoch': '5.087'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3793/5680 [9:36:16<4:11:47,  8.01s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3794/5680 [9:36:24<4:11:07,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4897', 'grad_norm': '0.3661', 'learning_rate': '4.97e-05', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 31080448, 'tokens/trainable': 30744790, 'epoch': '5.087'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 3794/5680 [9:36:24<4:11:07,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3795/5680 [9:36:32<4:11:24,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5397', 'grad_norm': '0.3634', 'learning_rate': '4.965e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 31088640, 'tokens/trainable': 30752864, 'epoch': '5.087'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3795/5680 [9:36:32<4:11:24,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3796/5680 [9:36:40<4:11:52,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5085', 'grad_norm': '0.3532', 'learning_rate': '4.96e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.8', 'tokens/total': 31096832, 'tokens/trainable': 30760910, 'epoch': '5.087'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3796/5680 [9:36:40<4:11:52,  8.02s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3797/5680 [9:36:48<4:11:58,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.512', 'grad_norm': '0.41', 'learning_rate': '4.955e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 31105024, 'tokens/trainable': 30768976, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 3797/5680 [9:36:48<4:11:58,  8.03s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3798/5680 [9:36:56<4:11:12,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.5574', 'grad_norm': '0.4123', 'learning_rate': '4.951e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 31113216, 'tokens/trainable': 30777160, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3798/5680 [9:36:56<4:11:12,  8.01s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3799/5680 [9:37:04<4:10:39,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3184', 'grad_norm': '0.3636', 'learning_rate': '4.946e-05', 'ppl': '1.375', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 31121408, 'tokens/trainable': 30785336, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3799/5680 [9:37:04<4:10:39,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3800/5680 [9:37:12<4:10:10,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5893', 'grad_norm': '0.4062', 'learning_rate': '4.941e-05', 'ppl': '1.803', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 31129600, 'tokens/trainable': 30793476, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3800/5680 [9:37:12<4:10:10,  7.98s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3801/5680 [9:37:20<4:10:25,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6979', 'grad_norm': '0.5554', 'learning_rate': '4.936e-05', 'ppl': '2.01', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 31137792, 'tokens/trainable': 30801592, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 3801/5680 [9:37:20<4:10:25,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3802/5680 [9:37:28<4:10:09,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6132', 'grad_norm': '0.3599', 'learning_rate': '4.932e-05', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31145984, 'tokens/trainable': 30809684, 'epoch': '5.088'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3802/5680 [9:37:28<4:10:09,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3803/5680 [9:37:36<4:09:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6005', 'grad_norm': '0.4309', 'learning_rate': '4.927e-05', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 31154176, 'tokens/trainable': 30817764, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3803/5680 [9:37:36<4:09:53,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3804/5680 [9:37:44<4:09:43,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3321', 'grad_norm': '0.3654', 'learning_rate': '4.922e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 31162368, 'tokens/trainable': 30825940, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3804/5680 [9:37:44<4:09:43,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3805/5680 [9:37:52<4:09:38,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4286', 'grad_norm': '0.4313', 'learning_rate': '4.917e-05', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 31170560, 'tokens/trainable': 30834108, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 3805/5680 [9:37:52<4:09:38,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3806/5680 [9:38:00<4:09:31,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3647', 'grad_norm': '0.3835', 'learning_rate': '4.912e-05', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.8', 'tokens/total': 31178752, 'tokens/trainable': 30842088, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3806/5680 [9:38:00<4:09:31,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3807/5680 [9:38:08<4:09:22,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4931', 'grad_norm': '0.5204', 'learning_rate': '4.908e-05', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 31186944, 'tokens/trainable': 30850100, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3807/5680 [9:38:08<4:09:22,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3808/5680 [9:38:16<4:09:27,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6824', 'grad_norm': '0.4063', 'learning_rate': '4.903e-05', 'ppl': '1.979', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 31195136, 'tokens/trainable': 30858282, 'epoch': '5.089'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 3808/5680 [9:38:16<4:09:27,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3809/5680 [9:38:24<4:09:04,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4846', 'grad_norm': '0.3902', 'learning_rate': '4.898e-05', 'ppl': '1.624', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 31203328, 'tokens/trainable': 30866458, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3809/5680 [9:38:24<4:09:04,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3810/5680 [9:38:32<4:08:45,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.2329', 'grad_norm': '0.3262', 'learning_rate': '4.893e-05', 'ppl': '1.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 31211520, 'tokens/trainable': 30874600, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3810/5680 [9:38:32<4:08:45,  7.98s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3811/5680 [9:38:40<4:09:02,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4799', 'grad_norm': '0.3383', 'learning_rate': '4.889e-05', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 31219712, 'tokens/trainable': 30882686, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3811/5680 [9:38:40<4:09:02,  7.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3812/5680 [9:38:48<4:09:00,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6032', 'grad_norm': '0.387', 'learning_rate': '4.884e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.1', 'tokens/total': 31227904, 'tokens/trainable': 30890676, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3812/5680 [9:38:48<4:09:00,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3813/5680 [9:38:56<4:08:55,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4227', 'grad_norm': '0.3525', 'learning_rate': '4.879e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 31236096, 'tokens/trainable': 30898780, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3813/5680 [9:38:56<4:08:55,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3814/5680 [9:39:04<4:08:52,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5252', 'grad_norm': '0.4457', 'learning_rate': '4.874e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.7', 'tokens/total': 31244288, 'tokens/trainable': 30906744, 'epoch': '5.09'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3814/5680 [9:39:04<4:08:52,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3815/5680 [9:39:12<4:08:32,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.6429', 'grad_norm': '0.4164', 'learning_rate': '4.87e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 31252480, 'tokens/trainable': 30914820, 'epoch': '5.091'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3815/5680 [9:39:12<4:08:32,  8.00s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3816/5680 [9:39:20<4:08:16,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4497', 'grad_norm': '0.4033', 'learning_rate': '4.865e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.7', 'tokens/total': 31260672, 'tokens/trainable': 30922796, 'epoch': '5.091'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 3816/5680 [9:39:20<4:08:16,  7.99s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3817/5680 [9:39:28<4:08:30,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3998', 'grad_norm': '0.4112', 'learning_rate': '4.86e-05', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 31268864, 'tokens/trainable': 30930840, 'epoch': '5.091'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3817/5680 [9:39:28<4:08:30,  8.00s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3818/5680 [9:39:36<4:08:19,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4522', 'grad_norm': '0.3793', 'learning_rate': '4.855e-05', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 31277056, 'tokens/trainable': 30938892, 'epoch': '5.091'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3818/5680 [9:39:36<4:08:19,  8.00s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3819/5680 [9:39:44<4:07:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3886', 'grad_norm': '0.3633', 'learning_rate': '4.851e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 31285248, 'tokens/trainable': 30947056, 'epoch': '5.091'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 3819/5680 [9:39:44<4:07:53,  7.99s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3820/5680 [9:39:52<4:07:31,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7599', 'grad_norm': '0.378', 'learning_rate': '4.846e-05', 'ppl': '2.138', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 31293440, 'tokens/trainable': 30955096, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3820/5680 [9:39:52<4:07:31,  7.98s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3821/5680 [9:40:00<4:07:27,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.748', 'grad_norm': '0.4243', 'learning_rate': '4.841e-05', 'ppl': '2.113', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 31301632, 'tokens/trainable': 30963184, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3821/5680 [9:40:00<4:07:27,  7.99s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3822/5680 [9:40:08<4:06:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3569', 'grad_norm': '0.3459', 'learning_rate': '4.836e-05', 'ppl': '1.429', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 31309824, 'tokens/trainable': 30971272, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3822/5680 [9:40:08<4:06:45,  7.97s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3823/5680 [9:40:16<4:07:01,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4262', 'grad_norm': '0.3373', 'learning_rate': '4.832e-05', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 31318016, 'tokens/trainable': 30979364, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 3823/5680 [9:40:16<4:07:01,  7.98s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3824/5680 [9:40:24<4:10:14,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.4555', 'grad_norm': '0.4315', 'learning_rate': '4.827e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.5', 'tokens/total': 31326208, 'tokens/trainable': 30987500, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3824/5680 [9:40:24<4:10:14,  8.09s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3825/5680 [9:40:32<4:09:13,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.3075', 'grad_norm': '0.3174', 'learning_rate': '4.822e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31334400, 'tokens/trainable': 30995604, 'epoch': '5.092'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3825/5680 [9:40:32<4:09:13,  8.06s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3826/5680 [9:40:40<4:08:09,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5169', 'grad_norm': '0.3789', 'learning_rate': '4.818e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 31342592, 'tokens/trainable': 31003760, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3826/5680 [9:40:40<4:08:09,  8.03s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3827/5680 [9:40:48<4:07:53,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4985', 'grad_norm': '0.3835', 'learning_rate': '4.813e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 31350784, 'tokens/trainable': 31011864, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3827/5680 [9:40:48<4:07:53,  8.03s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3828/5680 [9:40:56<4:07:15,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3297', 'grad_norm': '0.3868', 'learning_rate': '4.808e-05', 'ppl': '1.391', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 31358976, 'tokens/trainable': 31019888, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3828/5680 [9:40:56<4:07:15,  8.01s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3829/5680 [9:41:04<4:07:00,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.444', 'grad_norm': '0.375', 'learning_rate': '4.803e-05', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 31367168, 'tokens/trainable': 31027912, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3829/5680 [9:41:04<4:07:00,  8.01s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3830/5680 [9:41:12<4:06:15,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3592', 'grad_norm': '0.3446', 'learning_rate': '4.799e-05', 'ppl': '1.432', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 31375360, 'tokens/trainable': 31035938, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3830/5680 [9:41:12<4:06:15,  7.99s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3831/5680 [9:41:20<4:06:15,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7304', 'grad_norm': '0.4196', 'learning_rate': '4.794e-05', 'ppl': '2.076', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 31383552, 'tokens/trainable': 31044072, 'epoch': '5.093'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 3831/5680 [9:41:20<4:06:15,  7.99s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3832/5680 [9:41:28<4:06:18,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.7028', 'grad_norm': '0.4025', 'learning_rate': '4.789e-05', 'ppl': '2.019', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31391744, 'tokens/trainable': 31052224, 'epoch': '5.094'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3832/5680 [9:41:28<4:06:18,  8.00s/it] 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3833/5680 [9:41:36<4:06:13,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.333', 'grad_norm': '0.3823', 'learning_rate': '4.784e-05', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 31399936, 'tokens/trainable': 31060296, 'epoch': '5.094'}
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3833/5680 [9:41:36<4:06:13,  8.00s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3834/5680 [9:41:44<4:05:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6908', 'grad_norm': '0.4152', 'learning_rate': '4.78e-05', 'ppl': '1.995', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 31408128, 'tokens/trainable': 31068356, 'epoch': '5.094'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3834/5680 [9:41:44<4:05:53,  7.99s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3835/5680 [9:41:52<4:05:29,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5465', 'grad_norm': '0.367', 'learning_rate': '4.775e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 31416320, 'tokens/trainable': 31076520, 'epoch': '5.094'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3835/5680 [9:41:52<4:05:29,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3836/5680 [9:42:00<4:05:11,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4764', 'grad_norm': '0.3472', 'learning_rate': '4.77e-05', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 31424512, 'tokens/trainable': 31084516, 'epoch': '5.094'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3836/5680 [9:42:00<4:05:11,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3837/5680 [9:42:08<4:05:01,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5084', 'grad_norm': '0.5546', 'learning_rate': '4.766e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 31432704, 'tokens/trainable': 31092530, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3837/5680 [9:42:08<4:05:01,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3838/5680 [9:42:16<4:05:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.579', 'grad_norm': '0.4671', 'learning_rate': '4.761e-05', 'ppl': '1.784', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 31440896, 'tokens/trainable': 31100544, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 3838/5680 [9:42:16<4:05:13,  7.99s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3839/5680 [9:42:24<4:05:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5235', 'grad_norm': '0.3638', 'learning_rate': '4.756e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 31449088, 'tokens/trainable': 31108596, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3839/5680 [9:42:24<4:05:13,  7.99s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3840/5680 [9:42:32<4:04:58,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3747', 'grad_norm': '0.3585', 'learning_rate': '4.751e-05', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31457280, 'tokens/trainable': 31116682, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3840/5680 [9:42:32<4:04:58,  7.99s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3841/5680 [9:42:40<4:04:43,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4139', 'grad_norm': '0.3734', 'learning_rate': '4.747e-05', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 31465472, 'tokens/trainable': 31124792, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3841/5680 [9:42:40<4:04:43,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3842/5680 [9:42:48<4:04:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4873', 'grad_norm': '0.4767', 'learning_rate': '4.742e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.3', 'tokens/total': 31473664, 'tokens/trainable': 31132684, 'epoch': '5.095'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 3842/5680 [9:42:48<4:04:15,  7.97s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3843/5680 [9:42:56<4:04:12,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.7517', 'grad_norm': '0.4264', 'learning_rate': '4.737e-05', 'ppl': '2.121', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 31481856, 'tokens/trainable': 31140856, 'epoch': '5.096'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3843/5680 [9:42:56<4:04:12,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3844/5680 [9:43:04<4:04:12,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4282', 'grad_norm': '0.4971', 'learning_rate': '4.733e-05', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 31490048, 'tokens/trainable': 31148948, 'epoch': '5.096'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3844/5680 [9:43:04<4:04:12,  7.98s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3845/5680 [9:43:12<4:04:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4828', 'grad_norm': '0.3605', 'learning_rate': '4.728e-05', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 31498240, 'tokens/trainable': 31156970, 'epoch': '5.096'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 3845/5680 [9:43:12<4:04:09,  7.98s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3846/5680 [9:43:20<4:03:57,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6376', 'grad_norm': '0.4379', 'learning_rate': '4.723e-05', 'ppl': '1.892', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 31506432, 'tokens/trainable': 31165100, 'epoch': '5.096'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3846/5680 [9:43:20<4:03:57,  7.98s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3847/5680 [9:43:28<4:04:07,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5526', 'grad_norm': '0.4081', 'learning_rate': '4.719e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31514624, 'tokens/trainable': 31173224, 'epoch': '5.096'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3847/5680 [9:43:28<4:04:07,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3848/5680 [9:43:36<4:04:02,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4167', 'grad_norm': '0.3514', 'learning_rate': '4.714e-05', 'ppl': '1.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 31522816, 'tokens/trainable': 31181404, 'epoch': '5.096'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3848/5680 [9:43:36<4:04:02,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3849/5680 [9:43:44<4:03:51,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5093', 'grad_norm': '0.4626', 'learning_rate': '4.709e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.7', 'tokens/total': 31531008, 'tokens/trainable': 31189362, 'epoch': '5.097'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3849/5680 [9:43:44<4:03:51,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3850/5680 [9:43:52<4:03:33,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5636', 'grad_norm': '0.4033', 'learning_rate': '4.704e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31539200, 'tokens/trainable': 31197478, 'epoch': '5.097'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3850/5680 [9:43:52<4:03:33,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3851/5680 [9:44:00<4:03:21,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6397', 'grad_norm': '0.3657', 'learning_rate': '4.7e-05', 'ppl': '1.896', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 31547392, 'tokens/trainable': 31205668, 'epoch': '5.097'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3851/5680 [9:44:00<4:03:21,  7.98s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3852/5680 [9:44:08<4:03:28,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6436', 'grad_norm': '0.3921', 'learning_rate': '4.695e-05', 'ppl': '1.903', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31555584, 'tokens/trainable': 31213820, 'epoch': '5.097'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3852/5680 [9:44:08<4:03:28,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3853/5680 [9:44:16<4:03:19,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3331', 'grad_norm': '0.3276', 'learning_rate': '4.69e-05', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 31563776, 'tokens/trainable': 31221940, 'epoch': '5.097'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 3853/5680 [9:44:16<4:03:19,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3854/5680 [9:44:24<4:03:06,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.658', 'grad_norm': '0.4322', 'learning_rate': '4.686e-05', 'ppl': '1.931', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 31571968, 'tokens/trainable': 31229956, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3854/5680 [9:44:24<4:03:06,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3855/5680 [9:44:32<4:03:02,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6442', 'grad_norm': '0.3696', 'learning_rate': '4.681e-05', 'ppl': '1.904', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.7', 'tokens/total': 31580160, 'tokens/trainable': 31237940, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3855/5680 [9:44:32<4:03:02,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3856/5680 [9:44:40<4:02:56,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4912', 'grad_norm': '0.3516', 'learning_rate': '4.676e-05', 'ppl': '1.634', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 31588352, 'tokens/trainable': 31246096, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3856/5680 [9:44:40<4:02:56,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3857/5680 [9:44:48<4:02:40,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5989', 'grad_norm': '0.5673', 'learning_rate': '4.672e-05', 'ppl': '1.82', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 31596544, 'tokens/trainable': 31254270, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3857/5680 [9:44:48<4:02:40,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3858/5680 [9:44:56<4:02:19,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5488', 'grad_norm': '0.3707', 'learning_rate': '4.667e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.4', 'tokens/total': 31604736, 'tokens/trainable': 31262194, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3858/5680 [9:44:56<4:02:19,  7.98s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3859/5680 [9:45:03<4:01:49,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3911', 'grad_norm': '0.3754', 'learning_rate': '4.662e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 31612928, 'tokens/trainable': 31270308, 'epoch': '5.098'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3859/5680 [9:45:03<4:01:49,  7.97s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3860/5680 [9:45:11<4:01:35,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4979', 'grad_norm': '0.4027', 'learning_rate': '4.658e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 31621120, 'tokens/trainable': 31278396, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 3860/5680 [9:45:11<4:01:35,  7.96s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3861/5680 [9:45:19<4:01:45,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5527', 'grad_norm': '0.4299', 'learning_rate': '4.653e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988', 'tokens/total': 31629312, 'tokens/trainable': 31286296, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3861/5680 [9:45:19<4:01:45,  7.97s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3862/5680 [9:45:27<4:01:24,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4856', 'grad_norm': '0.4188', 'learning_rate': '4.648e-05', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.3', 'tokens/total': 31637504, 'tokens/trainable': 31294190, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3862/5680 [9:45:27<4:01:24,  7.97s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3863/5680 [9:45:35<4:01:11,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6486', 'grad_norm': '0.4834', 'learning_rate': '4.644e-05', 'ppl': '1.913', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.3', 'tokens/total': 31645696, 'tokens/trainable': 31302134, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3863/5680 [9:45:35<4:01:11,  7.96s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3864/5680 [9:45:43<4:01:15,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.657', 'grad_norm': '0.3968', 'learning_rate': '4.639e-05', 'ppl': '1.929', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 31653888, 'tokens/trainable': 31310032, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 3864/5680 [9:45:43<4:01:15,  7.97s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3865/5680 [9:45:51<4:01:47,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4336', 'grad_norm': '0.3265', 'learning_rate': '4.634e-05', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 31662080, 'tokens/trainable': 31318110, 'epoch': '5.099'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3865/5680 [9:45:51<4:01:47,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3866/5680 [9:45:59<4:01:41,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5578', 'grad_norm': '0.3834', 'learning_rate': '4.63e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 31670272, 'tokens/trainable': 31326204, 'epoch': '5.1'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3866/5680 [9:45:59<4:01:41,  7.99s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3867/5680 [9:46:08<4:05:13,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.4386', 'grad_norm': '0.711', 'learning_rate': '4.625e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.9', 'tokens/total': 31678464, 'tokens/trainable': 31334180, 'epoch': '5.1'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3867/5680 [9:46:08<4:05:13,  8.12s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3868/5680 [9:46:16<4:04:02,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.292', 'grad_norm': '0.3532', 'learning_rate': '4.62e-05', 'ppl': '1.339', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 31686656, 'tokens/trainable': 31342348, 'epoch': '5.1'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 3868/5680 [9:46:16<4:04:02,  8.08s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3869/5680 [9:46:24<4:03:27,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.6639', 'grad_norm': '0.3932', 'learning_rate': '4.616e-05', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 31694848, 'tokens/trainable': 31350500, 'epoch': '5.1'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3869/5680 [9:46:24<4:03:27,  8.07s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3870/5680 [9:46:32<4:02:31,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.2771', 'grad_norm': '0.3359', 'learning_rate': '4.611e-05', 'ppl': '1.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.5', 'tokens/total': 31703040, 'tokens/trainable': 31358376, 'epoch': '5.1'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3870/5680 [9:46:32<4:02:31,  8.04s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3871/5680 [9:46:40<4:01:44,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5413', 'grad_norm': '0.3672', 'learning_rate': '4.606e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 31711232, 'tokens/trainable': 31366320, 'epoch': '5.101'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 3871/5680 [9:46:40<4:01:44,  8.02s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3872/5680 [9:46:48<4:01:23,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.65', 'grad_norm': '0.4144', 'learning_rate': '4.602e-05', 'ppl': '1.916', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 31719424, 'tokens/trainable': 31374324, 'epoch': '5.101'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3872/5680 [9:46:48<4:01:23,  8.01s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3873/5680 [9:46:56<4:01:03,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5974', 'grad_norm': '0.4724', 'learning_rate': '4.597e-05', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 31727616, 'tokens/trainable': 31382492, 'epoch': '5.101'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3873/5680 [9:46:56<4:01:03,  8.00s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3874/5680 [9:47:04<4:00:52,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.511', 'grad_norm': '0.3932', 'learning_rate': '4.592e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 31735808, 'tokens/trainable': 31390656, 'epoch': '5.101'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3874/5680 [9:47:04<4:00:52,  8.00s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3875/5680 [9:47:12<4:00:24,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5186', 'grad_norm': '0.3995', 'learning_rate': '4.588e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 31744000, 'tokens/trainable': 31398836, 'epoch': '5.101'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                             | 3875/5680 [9:47:12<4:00:24,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3876/5680 [9:47:20<4:00:00,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3301', 'grad_norm': '0.364', 'learning_rate': '4.583e-05', 'ppl': '1.391', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 31752192, 'tokens/trainable': 31406860, 'epoch': '5.101'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3876/5680 [9:47:20<4:00:00,  7.98s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3877/5680 [9:47:28<3:59:55,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4655', 'grad_norm': '0.3754', 'learning_rate': '4.578e-05', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 31760384, 'tokens/trainable': 31414838, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3877/5680 [9:47:28<3:59:55,  7.98s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3878/5680 [9:47:36<4:00:06,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5318', 'grad_norm': '0.4088', 'learning_rate': '4.574e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.2', 'tokens/total': 31768576, 'tokens/trainable': 31422448, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3878/5680 [9:47:36<4:00:06,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3879/5680 [9:47:44<4:00:50,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4094', 'grad_norm': '0.4308', 'learning_rate': '4.569e-05', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.5', 'tokens/total': 31776768, 'tokens/trainable': 31430508, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 3879/5680 [9:47:44<4:00:50,  8.02s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3880/5680 [9:47:52<4:00:08,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.5364', 'grad_norm': '0.3698', 'learning_rate': '4.564e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988', 'tokens/total': 31784960, 'tokens/trainable': 31438372, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3880/5680 [9:47:52<4:00:08,  8.00s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3881/5680 [9:48:00<3:59:28,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3375', 'grad_norm': '0.408', 'learning_rate': '4.56e-05', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.1', 'tokens/total': 31793152, 'tokens/trainable': 31446252, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3881/5680 [9:48:00<3:59:28,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3882/5680 [9:48:08<3:59:25,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5335', 'grad_norm': '0.4105', 'learning_rate': '4.555e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.4', 'tokens/total': 31801344, 'tokens/trainable': 31454088, 'epoch': '5.102'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 3882/5680 [9:48:08<3:59:25,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3883/5680 [9:48:16<3:59:13,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5006', 'grad_norm': '0.3637', 'learning_rate': '4.551e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.4', 'tokens/total': 31809536, 'tokens/trainable': 31461976, 'epoch': '5.103'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3883/5680 [9:48:16<3:59:13,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3884/5680 [9:48:24<3:58:59,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5513', 'grad_norm': '0.4039', 'learning_rate': '4.546e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.7', 'tokens/total': 31817728, 'tokens/trainable': 31469852, 'epoch': '5.103'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3884/5680 [9:48:24<3:58:59,  7.98s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3885/5680 [9:48:32<3:58:29,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.7419', 'grad_norm': '0.3991', 'learning_rate': '4.541e-05', 'ppl': '2.1', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.1', 'tokens/total': 31825920, 'tokens/trainable': 31477732, 'epoch': '5.103'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3885/5680 [9:48:32<3:58:29,  7.97s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3886/5680 [9:48:40<3:58:26,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5931', 'grad_norm': '0.4341', 'learning_rate': '4.537e-05', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 31834112, 'tokens/trainable': 31485856, 'epoch': '5.103'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 3886/5680 [9:48:40<3:58:26,  7.97s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3887/5680 [9:48:48<3:58:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.8307', 'grad_norm': '0.4051', 'learning_rate': '4.532e-05', 'ppl': '2.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31842304, 'tokens/trainable': 31494006, 'epoch': '5.103'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3887/5680 [9:48:48<3:58:53,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3888/5680 [9:48:56<3:58:33,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3925', 'grad_norm': '0.3562', 'learning_rate': '4.527e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 31850496, 'tokens/trainable': 31502100, 'epoch': '5.104'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3888/5680 [9:48:56<3:58:33,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3889/5680 [9:49:04<3:58:31,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4531', 'grad_norm': '0.4071', 'learning_rate': '4.523e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 31858688, 'tokens/trainable': 31510106, 'epoch': '5.104'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3889/5680 [9:49:04<3:58:31,  7.99s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3890/5680 [9:49:12<3:58:27,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7442', 'grad_norm': '0.3975', 'learning_rate': '4.518e-05', 'ppl': '2.105', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 31866880, 'tokens/trainable': 31518226, 'epoch': '5.104'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 3890/5680 [9:49:12<3:58:27,  7.99s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3891/5680 [9:49:20<3:58:01,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4609', 'grad_norm': '0.3941', 'learning_rate': '4.513e-05', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 31875072, 'tokens/trainable': 31526404, 'epoch': '5.104'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3891/5680 [9:49:20<3:58:01,  7.98s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3892/5680 [9:49:27<3:57:54,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3824', 'grad_norm': '0.4249', 'learning_rate': '4.509e-05', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 31883264, 'tokens/trainable': 31534568, 'epoch': '5.104'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3892/5680 [9:49:27<3:57:54,  7.98s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3893/5680 [9:49:35<3:57:52,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5348', 'grad_norm': '0.3698', 'learning_rate': '4.504e-05', 'ppl': '1.707', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.7', 'tokens/total': 31891456, 'tokens/trainable': 31542412, 'epoch': '5.104'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 3893/5680 [9:49:35<3:57:52,  7.99s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3894/5680 [9:49:44<4:00:43,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5389', 'grad_norm': '0.4034', 'learning_rate': '4.5e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.3', 'tokens/total': 31899648, 'tokens/trainable': 31550308, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3894/5680 [9:49:44<4:00:43,  8.09s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3895/5680 [9:49:52<3:59:31,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.3641', 'grad_norm': '0.4637', 'learning_rate': '4.495e-05', 'ppl': '1.439', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.4', 'tokens/total': 31907840, 'tokens/trainable': 31558072, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3895/5680 [9:49:52<3:59:31,  8.05s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3896/5680 [9:50:00<3:58:34,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.3609', 'grad_norm': '0.3921', 'learning_rate': '4.49e-05', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 31916032, 'tokens/trainable': 31566198, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3896/5680 [9:50:00<3:58:34,  8.02s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3897/5680 [9:50:08<3:57:29,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6346', 'grad_norm': '0.4508', 'learning_rate': '4.486e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 31924224, 'tokens/trainable': 31574104, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 3897/5680 [9:50:08<3:57:29,  7.99s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3898/5680 [9:50:16<3:56:37,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.2582', 'grad_norm': '0.3602', 'learning_rate': '4.481e-05', 'ppl': '1.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '970.3', 'tokens/total': 31932416, 'tokens/trainable': 31581776, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3898/5680 [9:50:16<3:56:37,  7.97s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3899/5680 [9:50:24<3:56:34,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.63', 'grad_norm': '0.4045', 'learning_rate': '4.477e-05', 'ppl': '1.878', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 31940608, 'tokens/trainable': 31589758, 'epoch': '5.105'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3899/5680 [9:50:24<3:56:34,  7.97s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3900/5680 [9:50:31<3:55:59,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5594', 'grad_norm': '0.4395', 'learning_rate': '4.472e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 31948800, 'tokens/trainable': 31597934, 'epoch': '5.106'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3900/5680 [9:50:31<3:55:59,  7.95s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3901/5680 [9:50:39<3:55:51,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4459', 'grad_norm': '0.3837', 'learning_rate': '4.467e-05', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 31956992, 'tokens/trainable': 31605880, 'epoch': '5.106'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                            | 3901/5680 [9:50:39<3:55:51,  7.96s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3902/5680 [9:50:47<3:55:35,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4555', 'grad_norm': '0.3738', 'learning_rate': '4.463e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 31965184, 'tokens/trainable': 31613896, 'epoch': '5.106'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3902/5680 [9:50:47<3:55:35,  7.95s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3903/5680 [9:50:55<3:55:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4047', 'grad_norm': '0.3977', 'learning_rate': '4.458e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.4', 'tokens/total': 31973376, 'tokens/trainable': 31621710, 'epoch': '5.106'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3903/5680 [9:50:55<3:55:17,  7.94s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3904/5680 [9:51:03<3:55:03,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4507', 'grad_norm': '0.3715', 'learning_rate': '4.453e-05', 'ppl': '1.569', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 31981568, 'tokens/trainable': 31629748, 'epoch': '5.106'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 3904/5680 [9:51:03<3:55:03,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3905/5680 [9:51:11<3:55:10,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6708', 'grad_norm': '0.4914', 'learning_rate': '4.449e-05', 'ppl': '1.956', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.9', 'tokens/total': 31989760, 'tokens/trainable': 31637700, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3905/5680 [9:51:11<3:55:10,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3906/5680 [9:51:19<3:54:53,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5416', 'grad_norm': '0.449', 'learning_rate': '4.444e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 31997952, 'tokens/trainable': 31645680, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3906/5680 [9:51:19<3:54:53,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3907/5680 [9:51:27<3:54:47,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3034', 'grad_norm': '0.3852', 'learning_rate': '4.44e-05', 'ppl': '1.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.9', 'tokens/total': 32006144, 'tokens/trainable': 31653592, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3907/5680 [9:51:27<3:54:47,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3908/5680 [9:51:35<3:54:52,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6326', 'grad_norm': '0.4137', 'learning_rate': '4.435e-05', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '946.5', 'tokens/total': 32014336, 'tokens/trainable': 31661134, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                            | 3908/5680 [9:51:35<3:54:52,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3909/5680 [9:51:43<3:54:53,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.352', 'grad_norm': '0.4097', 'learning_rate': '4.431e-05', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 32022528, 'tokens/trainable': 31669264, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3909/5680 [9:51:43<3:54:53,  7.96s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3910/5680 [9:51:51<3:57:12,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.6701', 'grad_norm': '0.4251', 'learning_rate': '4.426e-05', 'ppl': '1.955', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.5', 'tokens/total': 32030720, 'tokens/trainable': 31677270, 'epoch': '5.107'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3910/5680 [9:51:51<3:57:12,  8.04s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3911/5680 [9:51:59<3:56:09,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4452', 'grad_norm': '0.4107', 'learning_rate': '4.421e-05', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32038912, 'tokens/trainable': 31685388, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3911/5680 [9:51:59<3:56:09,  8.01s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3912/5680 [9:52:07<3:55:18,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4703', 'grad_norm': '0.4304', 'learning_rate': '4.417e-05', 'ppl': '1.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 32047104, 'tokens/trainable': 31693540, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3912/5680 [9:52:07<3:55:18,  7.99s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3913/5680 [9:52:15<3:54:55,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4234', 'grad_norm': '0.3975', 'learning_rate': '4.412e-05', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '961.9', 'tokens/total': 32055296, 'tokens/trainable': 31701192, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3913/5680 [9:52:15<3:54:55,  7.98s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3914/5680 [9:52:23<3:54:17,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3793', 'grad_norm': '0.3613', 'learning_rate': '4.408e-05', 'ppl': '1.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '952.2', 'tokens/total': 32063488, 'tokens/trainable': 31708732, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3914/5680 [9:52:23<3:54:17,  7.96s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3915/5680 [9:52:31<3:54:27,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5742', 'grad_norm': '0.4117', 'learning_rate': '4.403e-05', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 32071680, 'tokens/trainable': 31716786, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3915/5680 [9:52:31<3:54:27,  7.97s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3916/5680 [9:52:39<3:54:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4774', 'grad_norm': '0.4058', 'learning_rate': '4.398e-05', 'ppl': '1.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 32079872, 'tokens/trainable': 31724908, 'epoch': '5.108'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                           | 3916/5680 [9:52:39<3:54:12,  7.97s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3917/5680 [9:52:47<3:54:25,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5042', 'grad_norm': '0.4308', 'learning_rate': '4.394e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 32088064, 'tokens/trainable': 31732982, 'epoch': '5.109'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3917/5680 [9:52:47<3:54:25,  7.98s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3918/5680 [9:52:55<3:53:48,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4105', 'grad_norm': '0.3819', 'learning_rate': '4.389e-05', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.3', 'tokens/total': 32096256, 'tokens/trainable': 31740764, 'epoch': '5.109'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3918/5680 [9:52:55<3:53:48,  7.96s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3919/5680 [9:53:03<3:53:27,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3163', 'grad_norm': '0.3465', 'learning_rate': '4.385e-05', 'ppl': '1.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '969.8', 'tokens/total': 32104448, 'tokens/trainable': 31748460, 'epoch': '5.109'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 3919/5680 [9:53:03<3:53:27,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3920/5680 [9:53:11<3:53:14,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4753', 'grad_norm': '0.3645', 'learning_rate': '4.38e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 32112640, 'tokens/trainable': 31756488, 'epoch': '5.109'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3920/5680 [9:53:11<3:53:14,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3921/5680 [9:53:19<3:52:53,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5601', 'grad_norm': '0.5052', 'learning_rate': '4.376e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.6', 'tokens/total': 32120832, 'tokens/trainable': 31764408, 'epoch': '5.109'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3921/5680 [9:53:19<3:52:53,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3922/5680 [9:53:27<3:52:57,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5024', 'grad_norm': '0.4342', 'learning_rate': '4.371e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 32129024, 'tokens/trainable': 31772532, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3922/5680 [9:53:27<3:52:57,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3923/5680 [9:53:35<3:52:31,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5743', 'grad_norm': '0.3937', 'learning_rate': '4.366e-05', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32137216, 'tokens/trainable': 31780600, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 3923/5680 [9:53:35<3:52:31,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3924/5680 [9:53:42<3:52:19,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5147', 'grad_norm': '0.4829', 'learning_rate': '4.362e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 32145408, 'tokens/trainable': 31788748, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3924/5680 [9:53:42<3:52:19,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3925/5680 [9:53:50<3:52:31,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5097', 'grad_norm': '0.3674', 'learning_rate': '4.357e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.7', 'tokens/total': 32153600, 'tokens/trainable': 31796442, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3925/5680 [9:53:50<3:52:31,  7.95s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3926/5680 [9:53:58<3:51:48,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4526', 'grad_norm': '0.4076', 'learning_rate': '4.353e-05', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '947.6', 'tokens/total': 32161792, 'tokens/trainable': 31803910, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3926/5680 [9:53:58<3:51:48,  7.93s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3927/5680 [9:54:06<3:51:44,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3235', 'grad_norm': '0.3444', 'learning_rate': '4.348e-05', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.8', 'tokens/total': 32169984, 'tokens/trainable': 31811844, 'epoch': '5.11'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 3927/5680 [9:54:06<3:51:44,  7.93s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3928/5680 [9:54:14<3:51:28,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6036', 'grad_norm': '0.4573', 'learning_rate': '4.344e-05', 'ppl': '1.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.1', 'tokens/total': 32178176, 'tokens/trainable': 31819656, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3928/5680 [9:54:14<3:51:28,  7.93s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3929/5680 [9:54:22<3:51:30,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5203', 'grad_norm': '0.4355', 'learning_rate': '4.339e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '958.4', 'tokens/total': 32186368, 'tokens/trainable': 31827270, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3929/5680 [9:54:22<3:51:30,  7.93s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3930/5680 [9:54:30<3:51:30,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6233', 'grad_norm': '0.4216', 'learning_rate': '4.334e-05', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 32194560, 'tokens/trainable': 31835226, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 3930/5680 [9:54:30<3:51:30,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3931/5680 [9:54:38<3:51:16,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6556', 'grad_norm': '0.4396', 'learning_rate': '4.33e-05', 'ppl': '1.926', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 32202752, 'tokens/trainable': 31843196, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3931/5680 [9:54:38<3:51:16,  7.93s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3932/5680 [9:54:46<3:50:37,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5001', 'grad_norm': '0.4693', 'learning_rate': '4.325e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 32210944, 'tokens/trainable': 31851346, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3932/5680 [9:54:46<3:50:37,  7.92s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3933/5680 [9:54:54<3:51:11,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4407', 'grad_norm': '0.4734', 'learning_rate': '4.321e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.6', 'tokens/total': 32219136, 'tokens/trainable': 31859114, 'epoch': '5.111'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3933/5680 [9:54:54<3:51:11,  7.94s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3934/5680 [9:55:02<3:50:51,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5865', 'grad_norm': '0.3898', 'learning_rate': '4.316e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 32227328, 'tokens/trainable': 31867236, 'epoch': '5.112'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 3934/5680 [9:55:02<3:50:51,  7.93s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3935/5680 [9:55:10<3:51:23,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6483', 'grad_norm': '0.4173', 'learning_rate': '4.312e-05', 'ppl': '1.912', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.2', 'tokens/total': 32235520, 'tokens/trainable': 31874956, 'epoch': '5.112'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3935/5680 [9:55:10<3:51:23,  7.96s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3936/5680 [9:55:18<3:51:03,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5828', 'grad_norm': '0.4263', 'learning_rate': '4.307e-05', 'ppl': '1.791', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 32243712, 'tokens/trainable': 31883100, 'epoch': '5.112'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3936/5680 [9:55:18<3:51:03,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3937/5680 [9:55:26<3:51:01,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2713', 'grad_norm': '0.3989', 'learning_rate': '4.303e-05', 'ppl': '1.312', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994', 'tokens/total': 32251904, 'tokens/trainable': 31891012, 'epoch': '5.112'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3937/5680 [9:55:26<3:51:01,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3938/5680 [9:55:34<3:50:48,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3858', 'grad_norm': '0.3616', 'learning_rate': '4.298e-05', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.7', 'tokens/total': 32260096, 'tokens/trainable': 31898902, 'epoch': '5.112'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3938/5680 [9:55:34<3:50:48,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3939/5680 [9:55:42<3:50:33,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4473', 'grad_norm': '0.4148', 'learning_rate': '4.293e-05', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.1', 'tokens/total': 32268288, 'tokens/trainable': 31906654, 'epoch': '5.112'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3939/5680 [9:55:42<3:50:33,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3940/5680 [9:55:50<3:50:32,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2236', 'grad_norm': '0.3207', 'learning_rate': '4.289e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '951.1', 'tokens/total': 32276480, 'tokens/trainable': 31914224, 'epoch': '5.113'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3940/5680 [9:55:50<3:50:32,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3941/5680 [9:55:58<3:50:22,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3344', 'grad_norm': '0.3759', 'learning_rate': '4.284e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.3', 'tokens/total': 32284672, 'tokens/trainable': 31922108, 'epoch': '5.113'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3941/5680 [9:55:58<3:50:22,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3942/5680 [9:56:05<3:49:55,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3097', 'grad_norm': '0.3896', 'learning_rate': '4.28e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.4', 'tokens/total': 32292864, 'tokens/trainable': 31929868, 'epoch': '5.113'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3942/5680 [9:56:05<3:49:55,  7.94s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3943/5680 [9:56:13<3:49:40,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3927', 'grad_norm': '0.3311', 'learning_rate': '4.275e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 32301056, 'tokens/trainable': 31937850, 'epoch': '5.113'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3943/5680 [9:56:13<3:49:40,  7.93s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3944/5680 [9:56:21<3:50:01,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5209', 'grad_norm': '0.4477', 'learning_rate': '4.271e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.3', 'tokens/total': 32309248, 'tokens/trainable': 31945608, 'epoch': '5.113'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3944/5680 [9:56:21<3:50:01,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3945/5680 [9:56:29<3:49:54,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4981', 'grad_norm': '0.3931', 'learning_rate': '4.266e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 32317440, 'tokens/trainable': 31953666, 'epoch': '5.114'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3945/5680 [9:56:29<3:49:54,  7.95s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3946/5680 [9:56:37<3:50:12,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4953', 'grad_norm': '0.4601', 'learning_rate': '4.262e-05', 'ppl': '1.641', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '959.1', 'tokens/total': 32325632, 'tokens/trainable': 31961336, 'epoch': '5.114'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3946/5680 [9:56:37<3:50:12,  7.97s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3947/5680 [9:56:45<3:49:55,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4277', 'grad_norm': '0.3781', 'learning_rate': '4.257e-05', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977', 'tokens/total': 32333824, 'tokens/trainable': 31969100, 'epoch': '5.114'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3947/5680 [9:56:45<3:49:55,  7.96s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3948/5680 [9:56:53<3:49:51,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.2707', 'grad_norm': '0.2919', 'learning_rate': '4.253e-05', 'ppl': '1.311', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32342016, 'tokens/trainable': 31977256, 'epoch': '5.114'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3948/5680 [9:56:53<3:49:51,  7.96s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3949/5680 [9:57:01<3:49:59,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4768', 'grad_norm': '0.3472', 'learning_rate': '4.248e-05', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.7', 'tokens/total': 32350208, 'tokens/trainable': 31984892, 'epoch': '5.114'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 3949/5680 [9:57:01<3:49:59,  7.97s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3950/5680 [9:57:09<3:49:29,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5635', 'grad_norm': '0.4787', 'learning_rate': '4.244e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 32358400, 'tokens/trainable': 31992924, 'epoch': '5.114'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3950/5680 [9:57:09<3:49:29,  7.96s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3951/5680 [9:57:17<3:49:22,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3541', 'grad_norm': '0.4742', 'learning_rate': '4.239e-05', 'ppl': '1.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '955.2', 'tokens/total': 32366592, 'tokens/trainable': 32000526, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3951/5680 [9:57:17<3:49:22,  7.96s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3952/5680 [9:57:25<3:48:50,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6418', 'grad_norm': '0.4345', 'learning_rate': '4.235e-05', 'ppl': '1.9', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.9', 'tokens/total': 32374784, 'tokens/trainable': 32008356, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3952/5680 [9:57:25<3:48:50,  7.95s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3953/5680 [9:57:33<3:52:27,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.411', 'grad_norm': '0.3701', 'learning_rate': '4.23e-05', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.2', 'tokens/total': 32382976, 'tokens/trainable': 32016308, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 3953/5680 [9:57:33<3:52:27,  8.08s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3954/5680 [9:57:41<3:51:38,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.4592', 'grad_norm': '0.3537', 'learning_rate': '4.226e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981', 'tokens/total': 32391168, 'tokens/trainable': 32024150, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3954/5680 [9:57:41<3:51:38,  8.05s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3955/5680 [9:57:49<3:51:01,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.4559', 'learning_rate': '4.221e-05', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 32399360, 'tokens/trainable': 32032122, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3955/5680 [9:57:49<3:51:01,  8.04s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3956/5680 [9:57:57<3:50:45,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.3601', 'grad_norm': '0.3791', 'learning_rate': '4.217e-05', 'ppl': '1.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '958.3', 'tokens/total': 32407552, 'tokens/trainable': 32039806, 'epoch': '5.115'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 3956/5680 [9:57:57<3:50:45,  8.03s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3957/5680 [9:58:05<3:50:00,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.3334', 'grad_norm': '0.4026', 'learning_rate': '4.212e-05', 'ppl': '1.396', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '915.1', 'tokens/total': 32415744, 'tokens/trainable': 32047088, 'epoch': '5.116'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3957/5680 [9:58:05<3:50:00,  8.01s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3958/5680 [9:58:13<3:49:46,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4315', 'grad_norm': '0.5396', 'learning_rate': '4.207e-05', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '929.6', 'tokens/total': 32423936, 'tokens/trainable': 32054522, 'epoch': '5.116'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3958/5680 [9:58:13<3:49:46,  8.01s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3959/5680 [9:58:21<3:50:20,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.7118', 'grad_norm': '0.6588', 'learning_rate': '4.203e-05', 'ppl': '2.038', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '767.9', 'tokens/total': 32432128, 'tokens/trainable': 32060730, 'epoch': '5.116'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3959/5680 [9:58:21<3:50:20,  8.03s/it][2026-01-27 07:47:35,600] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:61431] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 07:47:36,966] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:61431] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<04:02, 23.01 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:47, 50.76 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:05<01:05, 81.43 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:46, 113.25 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:32, 158.76 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:06<00:28, 174.89 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:23, 206.57 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:20, 232.93 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:07<00:18, 259.96 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:16, 278.66 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:16, 280.38 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:08<00:13, 338.17 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:15, 283.97 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:14, 296.96 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:09<00:13, 303.17 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:13, 304.02 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:09<00:12, 307.11 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:10<00:12, 301.37 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:12, 294.34 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:11<00:11, 300.03 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:11<00:11, 299.70 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:11, 292.64 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:11<00:08, 364.17 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:12<00:11, 274.70 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:11, 273.91 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:13<00:08, 339.61 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:13<00:10, 269.10 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:09, 281.74 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:14<00:09, 288.40 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:14<00:08, 294.49 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:14<00:08, 297.27 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:15<00:07, 299.46 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:15<00:07, 303.20 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:15<00:06, 307.33 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:16<00:06, 306.08 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:16<00:06, 311.18 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:16<00:05, 313.37 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:17<00:05, 314.26 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:17<00:05, 302.49 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:18<00:04, 295.14 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:18<00:04, 305.15 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:18<00:04, 308.15 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:19<00:03, 292.98 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:19<00:02, 352.53 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:19<00:03, 295.54 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:20<00:02, 291.39 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:20<00:02, 300.00 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:20<00:02, 299.45 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:21<00:01, 293.86 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:21<00:01, 294.29 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:21<00:01, 289.59 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:22<00:00, 350.89 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:22<00:00, 303.06 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:22<00:00, 284.59 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:23<00:00, 241.20 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 811.76 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:03, 1183.29 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:02, 1300.69 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:03<00:01, 1438.62 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1512.21 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1577.41 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1393.64 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1141.23 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1718.35 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2043.47 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2278.22 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2305.47 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2035.59 examples/s]
[2026-01-27 07:48:14,121] [WARNING] [py.warnings._showwarnmsg:109] [PID:61431] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3960/5680 [9:59:08<9:22:55, 19.64s/it]                                                                                                                                                                                                                                             {'loss': '0.3623', 'grad_norm': '0.738', 'learning_rate': '4.198e-05', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '149.3', 'tokens/total': 32440320, 'tokens/trainable': 32067676, 'epoch': '5.116'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 3960/5680 [9:59:08<9:22:55, 19.64s/it][2026-01-27 07:48:22,509] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:61670] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 07:48:23,889] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:61670] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:07<06:31, 14.22 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:08<03:00, 30.35 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:08<01:46, 50.49 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:09<01:13, 71.59 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▌                                                                                                                                             | 530/5677 [00:09<00:54, 95.09 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:10<00:43, 114.85 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:10<00:36, 134.59 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:11<00:33, 146.29 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:11<00:29, 158.82 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:12<00:27, 165.83 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:12<00:26, 168.27 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:13<00:25, 171.99 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:14<00:24, 173.60 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:14<00:21, 191.49 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:15<00:22, 181.33 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:15<00:20, 192.91 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:16<00:20, 187.73 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:16<00:20, 180.18 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:17<00:18, 196.96 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:17<00:17, 201.78 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:18<00:18, 187.64 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:18<00:16, 199.55 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:19<00:16, 195.98 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:20<00:17, 182.76 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:20<00:14, 205.53 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:21<00:15, 194.42 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:21<00:14, 199.52 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:22<00:14, 189.65 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:22<00:13, 187.55 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:23<00:13, 192.06 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:23<00:12, 200.41 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:24<00:11, 200.42 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:24<00:10, 209.13 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:25<00:11, 188.16 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:25<00:10, 198.98 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:26<00:09, 206.26 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:27<00:09, 186.97 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:27<00:08, 191.32 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:27<00:07, 205.28 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:28<00:07, 195.65 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:29<00:07, 193.51 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:29<00:06, 201.36 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:30<00:05, 200.07 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:30<00:05, 192.56 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:31<00:04, 210.78 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:31<00:04, 199.04 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:32<00:03, 191.75 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:32<00:03, 207.41 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:33<00:02, 204.12 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:33<00:02, 206.05 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:34<00:01, 203.25 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:34<00:01, 189.25 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:35<00:00, 195.52 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:35<00:00, 196.15 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:37<00:00, 152.99 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:05, 812.95 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:03, 1054.49 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:02, 1172.33 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:03<00:01, 1250.89 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:04<00:00, 1396.33 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1506.53 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:04<00:00, 1295.95 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:03, 1259.96 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 1913.22 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2345.43 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2591.34 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2625.69 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2292.06 examples/s]
[2026-01-27 07:49:08,193] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:61670] Using single process for pack_parallel, running sequentially.
[2026-01-27 07:49:14,281] [WARNING] [py.warnings._showwarnmsg:109] [PID:61670] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3961/5680 [10:00:08<15:11:11, 31.80s/it]                                                                                                                                                                                                                                             {'loss': '0.5204', 'grad_norm': '0.4445', 'learning_rate': '4.194e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 32448512, 'tokens/trainable': 32075832, 'epoch': '6'}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3961/5680 [10:00:08<15:11:11, 31.80s/it] 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3962/5680 [10:00:16<11:46:08, 24.66s/it]                                                                                                                                                                                                                                             {'loss': '0.4545', 'grad_norm': '0.3371', 'learning_rate': '4.189e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32456704, 'tokens/trainable': 32084000, 'epoch': '6'}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3962/5680 [10:00:16<11:46:08, 24.66s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3963/5680 [10:00:24<9:22:54, 19.67s/it]                                                                                                                                                                                                                                             {'loss': '0.4413', 'grad_norm': '0.4556', 'learning_rate': '4.185e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 32464896, 'tokens/trainable': 32092168, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3963/5680 [10:00:24<9:22:54, 19.67s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3964/5680 [10:00:32<7:42:36, 16.18s/it]                                                                                                                                                                                                                                             {'loss': '0.6723', 'grad_norm': '0.4254', 'learning_rate': '4.18e-05', 'ppl': '1.959', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 32473088, 'tokens/trainable': 32100296, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3964/5680 [10:00:32<7:42:36, 16.18s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3965/5680 [10:00:40<6:32:29, 13.73s/it]                                                                                                                                                                                                                                             {'loss': '0.4414', 'grad_norm': '0.4369', 'learning_rate': '4.176e-05', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 32481280, 'tokens/trainable': 32108426, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3965/5680 [10:00:40<6:32:29, 13.73s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3966/5680 [10:00:48<5:43:17, 12.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4904', 'grad_norm': '0.3634', 'learning_rate': '4.171e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 32489472, 'tokens/trainable': 32116586, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 3966/5680 [10:00:48<5:43:17, 12.02s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3967/5680 [10:00:56<5:08:53, 10.82s/it]                                                                                                                                                                                                                                             {'loss': '0.5901', 'grad_norm': '0.5193', 'learning_rate': '4.167e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32497664, 'tokens/trainable': 32124764, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3967/5680 [10:00:56<5:08:53, 10.82s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3968/5680 [10:01:04<4:44:53,  9.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4721', 'grad_norm': '0.5016', 'learning_rate': '4.162e-05', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 32505856, 'tokens/trainable': 32132912, 'epoch': '6.001'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3968/5680 [10:01:04<4:44:53,  9.98s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3969/5680 [10:01:13<4:27:49,  9.39s/it]                                                                                                                                                                                                                                             {'loss': '0.6618', 'grad_norm': '0.4366', 'learning_rate': '4.158e-05', 'ppl': '1.938', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32514048, 'tokens/trainable': 32141098, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3969/5680 [10:01:13<4:27:49,  9.39s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3970/5680 [10:01:21<4:15:45,  8.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6027', 'grad_norm': '0.4565', 'learning_rate': '4.154e-05', 'ppl': '1.827', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32522240, 'tokens/trainable': 32149272, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3970/5680 [10:01:21<4:15:45,  8.97s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3971/5680 [10:01:29<4:07:26,  8.69s/it]                                                                                                                                                                                                                                             {'loss': '0.7011', 'grad_norm': '0.4116', 'learning_rate': '4.149e-05', 'ppl': '2.016', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32530432, 'tokens/trainable': 32157462, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3971/5680 [10:01:29<4:07:26,  8.69s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3972/5680 [10:01:37<4:01:46,  8.49s/it]                                                                                                                                                                                                                                             {'loss': '0.5082', 'grad_norm': '0.3504', 'learning_rate': '4.145e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 32538624, 'tokens/trainable': 32165608, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3972/5680 [10:01:37<4:01:46,  8.49s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3973/5680 [10:01:45<3:57:19,  8.34s/it]                                                                                                                                                                                                                                             {'loss': '0.4923', 'grad_norm': '0.4393', 'learning_rate': '4.14e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32546816, 'tokens/trainable': 32173742, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 3973/5680 [10:01:45<3:57:19,  8.34s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3974/5680 [10:01:53<3:54:06,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.2785', 'grad_norm': '0.3308', 'learning_rate': '4.136e-05', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32555008, 'tokens/trainable': 32181914, 'epoch': '6.002'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3974/5680 [10:01:53<3:54:06,  8.23s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3975/5680 [10:02:01<3:51:56,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.3198', 'grad_norm': '0.3851', 'learning_rate': '4.131e-05', 'ppl': '1.377', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 32563200, 'tokens/trainable': 32190048, 'epoch': '6.003'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3975/5680 [10:02:01<3:51:56,  8.16s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3976/5680 [10:02:08<3:50:03,  8.10s/it]                                                                                                                                                                                                                                             {'loss': '0.3578', 'grad_norm': '0.4043', 'learning_rate': '4.127e-05', 'ppl': '1.43', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32571392, 'tokens/trainable': 32198190, 'epoch': '6.003'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3976/5680 [10:02:08<3:50:03,  8.10s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3977/5680 [10:02:16<3:48:39,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.5047', 'grad_norm': '0.4503', 'learning_rate': '4.122e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 32579584, 'tokens/trainable': 32206338, 'epoch': '6.003'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 3977/5680 [10:02:16<3:48:39,  8.06s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3978/5680 [10:02:24<3:47:16,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.7013', 'grad_norm': '0.479', 'learning_rate': '4.118e-05', 'ppl': '2.016', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 32587776, 'tokens/trainable': 32214508, 'epoch': '6.003'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3978/5680 [10:02:24<3:47:16,  8.01s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3979/5680 [10:02:32<3:46:49,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4341', 'grad_norm': '0.3425', 'learning_rate': '4.113e-05', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32595968, 'tokens/trainable': 32222672, 'epoch': '6.003'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3979/5680 [10:02:32<3:46:49,  8.00s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3980/5680 [10:02:41<3:49:17,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.486', 'grad_norm': '0.5569', 'learning_rate': '4.109e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.2', 'tokens/total': 32604160, 'tokens/trainable': 32230838, 'epoch': '6.004'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3980/5680 [10:02:41<3:49:17,  8.09s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3981/5680 [10:02:49<3:48:03,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.5319', 'grad_norm': '0.3907', 'learning_rate': '4.104e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32612352, 'tokens/trainable': 32238948, 'epoch': '6.004'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 3981/5680 [10:02:49<3:48:03,  8.05s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3982/5680 [10:02:57<3:47:05,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.392', 'grad_norm': '0.3273', 'learning_rate': '4.1e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 32620544, 'tokens/trainable': 32247122, 'epoch': '6.004'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3982/5680 [10:02:57<3:47:05,  8.02s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3983/5680 [10:03:05<3:46:24,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.4194', 'grad_norm': '0.4094', 'learning_rate': '4.095e-05', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32628736, 'tokens/trainable': 32255272, 'epoch': '6.004'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3983/5680 [10:03:05<3:46:24,  8.00s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3984/5680 [10:03:12<3:45:53,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5128', 'grad_norm': '0.4555', 'learning_rate': '4.091e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32636928, 'tokens/trainable': 32263416, 'epoch': '6.004'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 3984/5680 [10:03:12<3:45:53,  7.99s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3985/5680 [10:03:20<3:45:20,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4839', 'grad_norm': '0.3532', 'learning_rate': '4.086e-05', 'ppl': '1.622', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 32645120, 'tokens/trainable': 32271572, 'epoch': '6.004'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3985/5680 [10:03:20<3:45:20,  7.98s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3986/5680 [10:03:28<3:45:31,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.6161', 'grad_norm': '0.5208', 'learning_rate': '4.082e-05', 'ppl': '1.852', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 32653312, 'tokens/trainable': 32279720, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3986/5680 [10:03:28<3:45:31,  7.99s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3987/5680 [10:03:36<3:45:18,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5376', 'grad_norm': '0.3775', 'learning_rate': '4.077e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32661504, 'tokens/trainable': 32287852, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3987/5680 [10:03:36<3:45:18,  7.98s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3988/5680 [10:03:44<3:45:23,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3782', 'grad_norm': '0.3546', 'learning_rate': '4.073e-05', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32669696, 'tokens/trainable': 32296040, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3988/5680 [10:03:44<3:45:23,  7.99s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3989/5680 [10:03:52<3:44:34,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5266', 'grad_norm': '0.3888', 'learning_rate': '4.069e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 32677888, 'tokens/trainable': 32304228, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3989/5680 [10:03:52<3:44:34,  7.97s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3990/5680 [10:04:00<3:43:58,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5269', 'grad_norm': '0.393', 'learning_rate': '4.064e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 32686080, 'tokens/trainable': 32312352, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3990/5680 [10:04:00<3:43:58,  7.95s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3991/5680 [10:04:08<3:43:54,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6058', 'grad_norm': '0.3802', 'learning_rate': '4.06e-05', 'ppl': '1.833', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32694272, 'tokens/trainable': 32320484, 'epoch': '6.005'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3991/5680 [10:04:08<3:43:54,  7.95s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3992/5680 [10:04:16<3:44:13,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3905', 'grad_norm': '0.3669', 'learning_rate': '4.055e-05', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 32702464, 'tokens/trainable': 32328626, 'epoch': '6.006'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 3992/5680 [10:04:16<3:44:13,  7.97s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3993/5680 [10:04:24<3:43:46,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5065', 'grad_norm': '0.3952', 'learning_rate': '4.051e-05', 'ppl': '1.66', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 32710656, 'tokens/trainable': 32336796, 'epoch': '6.006'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3993/5680 [10:04:24<3:43:46,  7.96s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3994/5680 [10:04:32<3:43:08,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3922', 'grad_norm': '0.3407', 'learning_rate': '4.046e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 32718848, 'tokens/trainable': 32344958, 'epoch': '6.006'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3994/5680 [10:04:32<3:43:08,  7.94s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3995/5680 [10:04:40<3:43:05,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4812', 'grad_norm': '0.3439', 'learning_rate': '4.042e-05', 'ppl': '1.618', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 32727040, 'tokens/trainable': 32353112, 'epoch': '6.006'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3995/5680 [10:04:40<3:43:05,  7.94s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3996/5680 [10:04:48<3:43:13,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.633', 'grad_norm': '0.4057', 'learning_rate': '4.037e-05', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32735232, 'tokens/trainable': 32361276, 'epoch': '6.006'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 3996/5680 [10:04:48<3:43:13,  7.95s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3997/5680 [10:04:56<3:45:59,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4852', 'grad_norm': '0.4063', 'learning_rate': '4.033e-05', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '980.4', 'tokens/total': 32743424, 'tokens/trainable': 32369408, 'epoch': '6.007'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3997/5680 [10:04:56<3:45:59,  8.06s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3998/5680 [10:05:04<3:44:49,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.429', 'grad_norm': '0.4056', 'learning_rate': '4.029e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32751616, 'tokens/trainable': 32377528, 'epoch': '6.007'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3998/5680 [10:05:04<3:44:49,  8.02s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3999/5680 [10:05:12<3:43:54,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7203', 'grad_norm': '0.53', 'learning_rate': '4.024e-05', 'ppl': '2.055', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 32759808, 'tokens/trainable': 32385700, 'epoch': '6.007'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 3999/5680 [10:05:12<3:43:54,  7.99s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                        | 4000/5680 [10:05:20<3:43:26,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5037', 'grad_norm': '0.3754', 'learning_rate': '4.02e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 32768000, 'tokens/trainable': 32393848, 'epoch': '6.007'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                        | 4000/5680 [10:05:20<3:43:26,  7.98s/it][2026-01-27 07:54:34,123] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 07:55:24,928] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-4000
[2026-01-27 07:56:28,282] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 07:56:28,283] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4001/5680 [10:07:24<19:54:22, 42.68s/it]                                                                                                                                                                                                                                             {'loss': '0.3781', 'grad_norm': '0.3712', 'learning_rate': '4.015e-05', 'ppl': '1.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '889', 'tokens/total': 32776192, 'tokens/trainable': 32401980, 'epoch': '6.007'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4001/5680 [10:07:24<19:54:22, 42.68s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4002/5680 [10:07:32<15:02:47, 32.28s/it]                                                                                                                                                                                                                                             {'loss': '0.3113', 'grad_norm': '0.3682', 'learning_rate': '4.011e-05', 'ppl': '1.365', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 32784384, 'tokens/trainable': 32410112, 'epoch': '6.007'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4002/5680 [10:07:32<15:02:47, 32.28s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4003/5680 [10:07:40<11:38:23, 24.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3563', 'grad_norm': '0.4547', 'learning_rate': '4.006e-05', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32792576, 'tokens/trainable': 32418266, 'epoch': '6.008'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4003/5680 [10:07:40<11:38:23, 24.99s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4004/5680 [10:07:48<9:16:10, 19.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4072', 'grad_norm': '0.3387', 'learning_rate': '4.002e-05', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 32800768, 'tokens/trainable': 32426452, 'epoch': '6.008'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4004/5680 [10:07:48<9:16:10, 19.91s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4005/5680 [10:07:56<7:36:03, 16.34s/it]                                                                                                                                                                                                                                             {'loss': '0.364', 'grad_norm': '0.5144', 'learning_rate': '3.998e-05', 'ppl': '1.439', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32808960, 'tokens/trainable': 32434626, 'epoch': '6.008'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4005/5680 [10:07:56<7:36:03, 16.34s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4006/5680 [10:08:04<6:26:21, 13.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3768', 'grad_norm': '0.37', 'learning_rate': '3.993e-05', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 32817152, 'tokens/trainable': 32442768, 'epoch': '6.008'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4006/5680 [10:08:04<6:26:21, 13.85s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4007/5680 [10:08:12<5:37:20, 12.10s/it]                                                                                                                                                                                                                                             {'loss': '0.2606', 'grad_norm': '0.3448', 'learning_rate': '3.989e-05', 'ppl': '1.298', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 32825344, 'tokens/trainable': 32450944, 'epoch': '6.008'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 4007/5680 [10:08:12<5:37:20, 12.10s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4008/5680 [10:08:20<5:03:10, 10.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5485', 'grad_norm': '0.3842', 'learning_rate': '3.984e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 32833536, 'tokens/trainable': 32459122, 'epoch': '6.008'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4008/5680 [10:08:20<5:03:10, 10.88s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4009/5680 [10:08:28<4:39:15, 10.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4247', 'grad_norm': '0.4071', 'learning_rate': '3.98e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 32841728, 'tokens/trainable': 32467300, 'epoch': '6.009'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4009/5680 [10:08:28<4:39:15, 10.03s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4010/5680 [10:08:36<4:22:04,  9.42s/it]                                                                                                                                                                                                                                             {'loss': '0.5102', 'grad_norm': '0.4033', 'learning_rate': '3.975e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32849920, 'tokens/trainable': 32475442, 'epoch': '6.009'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 4010/5680 [10:08:36<4:22:04,  9.42s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4011/5680 [10:08:44<4:10:08,  8.99s/it]                                                                                                                                                                                                                                             {'loss': '0.485', 'grad_norm': '0.3953', 'learning_rate': '3.971e-05', 'ppl': '1.624', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32858112, 'tokens/trainable': 32483624, 'epoch': '6.009'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4011/5680 [10:08:44<4:10:08,  8.99s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4012/5680 [10:08:52<4:01:34,  8.69s/it]                                                                                                                                                                                                                                             {'loss': '0.44', 'grad_norm': '0.4144', 'learning_rate': '3.967e-05', 'ppl': '1.553', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32866304, 'tokens/trainable': 32491784, 'epoch': '6.009'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4012/5680 [10:08:52<4:01:34,  8.69s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4013/5680 [10:09:00<3:55:35,  8.48s/it]                                                                                                                                                                                                                                             {'loss': '0.6439', 'grad_norm': '0.4685', 'learning_rate': '3.962e-05', 'ppl': '1.904', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32874496, 'tokens/trainable': 32499960, 'epoch': '6.009'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4013/5680 [10:09:00<3:55:35,  8.48s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4014/5680 [10:09:08<3:51:38,  8.34s/it]                                                                                                                                                                                                                                             {'loss': '0.3391', 'grad_norm': '0.4156', 'learning_rate': '3.958e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 32882688, 'tokens/trainable': 32508138, 'epoch': '6.01'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 4014/5680 [10:09:08<3:51:38,  8.34s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4015/5680 [10:09:16<3:48:38,  8.24s/it]                                                                                                                                                                                                                                             {'loss': '0.7951', 'grad_norm': '0.5083', 'learning_rate': '3.953e-05', 'ppl': '2.215', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 32890880, 'tokens/trainable': 32516302, 'epoch': '6.01'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4015/5680 [10:09:16<3:48:38,  8.24s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4016/5680 [10:09:24<3:45:53,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.5593', 'grad_norm': '0.4245', 'learning_rate': '3.949e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 32899072, 'tokens/trainable': 32524448, 'epoch': '6.01'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4016/5680 [10:09:24<3:45:53,  8.15s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4017/5680 [10:09:32<3:44:10,  8.09s/it]                                                                                                                                                                                                                                             {'loss': '0.5911', 'grad_norm': '0.4727', 'learning_rate': '3.945e-05', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 32907264, 'tokens/trainable': 32532596, 'epoch': '6.01'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4017/5680 [10:09:32<3:44:10,  8.09s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4018/5680 [10:09:40<3:43:12,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.7295', 'grad_norm': '0.4228', 'learning_rate': '3.94e-05', 'ppl': '2.074', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 32915456, 'tokens/trainable': 32540760, 'epoch': '6.01'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 4018/5680 [10:09:40<3:43:12,  8.06s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4019/5680 [10:09:48<3:42:15,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.453', 'grad_norm': '0.3854', 'learning_rate': '3.936e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 32923648, 'tokens/trainable': 32548948, 'epoch': '6.01'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4019/5680 [10:09:48<3:42:15,  8.03s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4020/5680 [10:09:56<3:41:51,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4644', 'grad_norm': '0.4414', 'learning_rate': '3.931e-05', 'ppl': '1.591', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 32931840, 'tokens/trainable': 32557092, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4020/5680 [10:09:56<3:41:51,  8.02s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4021/5680 [10:10:04<3:40:38,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4028', 'grad_norm': '0.3675', 'learning_rate': '3.927e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 32940032, 'tokens/trainable': 32565280, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4021/5680 [10:10:04<3:40:38,  7.98s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4022/5680 [10:10:12<3:40:02,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4699', 'grad_norm': '0.404', 'learning_rate': '3.923e-05', 'ppl': '1.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 32948224, 'tokens/trainable': 32573418, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 4022/5680 [10:10:12<3:40:02,  7.96s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4023/5680 [10:10:19<3:39:57,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5848', 'grad_norm': '0.4532', 'learning_rate': '3.918e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32956416, 'tokens/trainable': 32581574, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4023/5680 [10:10:19<3:39:57,  7.96s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4024/5680 [10:10:27<3:39:54,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5356', 'grad_norm': '0.4756', 'learning_rate': '3.914e-05', 'ppl': '1.708', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 32964608, 'tokens/trainable': 32589724, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4024/5680 [10:10:27<3:39:54,  7.97s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4025/5680 [10:10:36<3:42:41,  8.07s/it]                                                                                                                                                                                                                                             {'loss': '0.3374', 'grad_norm': '0.3427', 'learning_rate': '3.909e-05', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '976.5', 'tokens/total': 32972800, 'tokens/trainable': 32597848, 'epoch': '6.011'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 4025/5680 [10:10:36<3:42:41,  8.07s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4026/5680 [10:10:44<3:41:01,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4097', 'grad_norm': '0.4092', 'learning_rate': '3.905e-05', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 32980992, 'tokens/trainable': 32605992, 'epoch': '6.012'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4026/5680 [10:10:44<3:41:01,  8.02s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4027/5680 [10:10:52<3:40:12,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.7328', 'grad_norm': '0.5169', 'learning_rate': '3.901e-05', 'ppl': '2.081', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 32989184, 'tokens/trainable': 32614156, 'epoch': '6.012'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4027/5680 [10:10:52<3:40:12,  7.99s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4028/5680 [10:11:00<3:39:29,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6714', 'grad_norm': '0.4102', 'learning_rate': '3.896e-05', 'ppl': '1.957', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 32997376, 'tokens/trainable': 32622268, 'epoch': '6.012'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4028/5680 [10:11:00<3:39:29,  7.97s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4029/5680 [10:11:07<3:38:46,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6069', 'grad_norm': '0.4231', 'learning_rate': '3.892e-05', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33005568, 'tokens/trainable': 32630404, 'epoch': '6.012'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 4029/5680 [10:11:07<3:38:46,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4030/5680 [10:11:15<3:38:15,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.7342', 'grad_norm': '0.4083', 'learning_rate': '3.888e-05', 'ppl': '2.084', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33013760, 'tokens/trainable': 32638568, 'epoch': '6.012'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4030/5680 [10:11:15<3:38:15,  7.94s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4031/5680 [10:11:23<3:37:57,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3726', 'grad_norm': '0.4695', 'learning_rate': '3.883e-05', 'ppl': '1.452', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33021952, 'tokens/trainable': 32646712, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4031/5680 [10:11:23<3:37:57,  7.93s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4032/5680 [10:11:31<3:37:50,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4714', 'grad_norm': '0.4287', 'learning_rate': '3.879e-05', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33030144, 'tokens/trainable': 32654894, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4032/5680 [10:11:31<3:37:50,  7.93s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4033/5680 [10:11:39<3:37:48,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4317', 'grad_norm': '0.3853', 'learning_rate': '3.874e-05', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 33038336, 'tokens/trainable': 32663084, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 4033/5680 [10:11:39<3:37:48,  7.93s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4034/5680 [10:11:47<3:37:46,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4396', 'grad_norm': '0.3385', 'learning_rate': '3.87e-05', 'ppl': '1.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 33046528, 'tokens/trainable': 32671200, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4034/5680 [10:11:47<3:37:46,  7.94s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4035/5680 [10:11:55<3:37:33,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5231', 'grad_norm': '0.3874', 'learning_rate': '3.866e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33054720, 'tokens/trainable': 32679328, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4035/5680 [10:11:55<3:37:33,  7.94s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4036/5680 [10:12:03<3:37:46,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3979', 'grad_norm': '0.4719', 'learning_rate': '3.861e-05', 'ppl': '1.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 33062912, 'tokens/trainable': 32687488, 'epoch': '6.013'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 4036/5680 [10:12:03<3:37:46,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4037/5680 [10:12:11<3:37:37,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2754', 'grad_norm': '0.3104', 'learning_rate': '3.857e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33071104, 'tokens/trainable': 32695628, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4037/5680 [10:12:11<3:37:37,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4038/5680 [10:12:19<3:37:24,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.418', 'grad_norm': '0.4706', 'learning_rate': '3.853e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 33079296, 'tokens/trainable': 32703768, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4038/5680 [10:12:19<3:37:24,  7.94s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4039/5680 [10:12:27<3:37:32,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3102', 'grad_norm': '0.3618', 'learning_rate': '3.848e-05', 'ppl': '1.364', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 33087488, 'tokens/trainable': 32711878, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4039/5680 [10:12:27<3:37:32,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4040/5680 [10:12:35<3:37:43,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6681', 'grad_norm': '0.4028', 'learning_rate': '3.844e-05', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33095680, 'tokens/trainable': 32720060, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 4040/5680 [10:12:35<3:37:43,  7.97s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4041/5680 [10:12:43<3:37:07,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.7465', 'grad_norm': '0.4253', 'learning_rate': '3.84e-05', 'ppl': '2.11', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33103872, 'tokens/trainable': 32728220, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4041/5680 [10:12:43<3:37:07,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4042/5680 [10:12:51<3:37:02,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5681', 'grad_norm': '0.4299', 'learning_rate': '3.835e-05', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33112064, 'tokens/trainable': 32736404, 'epoch': '6.014'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4042/5680 [10:12:51<3:37:02,  7.95s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4043/5680 [10:12:59<3:36:39,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5387', 'grad_norm': '0.4063', 'learning_rate': '3.831e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 33120256, 'tokens/trainable': 32744524, 'epoch': '6.015'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4043/5680 [10:12:59<3:36:39,  7.94s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4044/5680 [10:13:07<3:36:15,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.429', 'grad_norm': '0.4408', 'learning_rate': '3.826e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 33128448, 'tokens/trainable': 32752700, 'epoch': '6.015'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 4044/5680 [10:13:07<3:36:15,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4045/5680 [10:13:15<3:38:48,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.5189', 'grad_norm': '0.4275', 'learning_rate': '3.822e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.3', 'tokens/total': 33136640, 'tokens/trainable': 32760770, 'epoch': '6.015'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4045/5680 [10:13:15<3:38:48,  8.03s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4046/5680 [10:13:23<3:37:50,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.2559', 'grad_norm': '0.3372', 'learning_rate': '3.818e-05', 'ppl': '1.292', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 33144832, 'tokens/trainable': 32768906, 'epoch': '6.015'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4046/5680 [10:13:23<3:37:50,  8.00s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4047/5680 [10:13:31<3:36:59,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4706', 'grad_norm': '0.4251', 'learning_rate': '3.813e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 33153024, 'tokens/trainable': 32777040, 'epoch': '6.015'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4047/5680 [10:13:31<3:36:59,  7.97s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4048/5680 [10:13:39<3:36:32,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5434', 'grad_norm': '0.4148', 'learning_rate': '3.809e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33161216, 'tokens/trainable': 32785172, 'epoch': '6.015'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 4048/5680 [10:13:39<3:36:32,  7.96s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4049/5680 [10:13:46<3:36:04,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5027', 'grad_norm': '0.3708', 'learning_rate': '3.805e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33169408, 'tokens/trainable': 32793348, 'epoch': '6.016'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4049/5680 [10:13:46<3:36:04,  7.95s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4050/5680 [10:13:54<3:35:52,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4159', 'grad_norm': '0.3853', 'learning_rate': '3.8e-05', 'ppl': '1.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33177600, 'tokens/trainable': 32801536, 'epoch': '6.016'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4050/5680 [10:13:54<3:35:52,  7.95s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4051/5680 [10:14:02<3:35:17,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3403', 'grad_norm': '0.3598', 'learning_rate': '3.796e-05', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33185792, 'tokens/trainable': 32809676, 'epoch': '6.016'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                      | 4051/5680 [10:14:02<3:35:17,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4052/5680 [10:14:10<3:35:00,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5217', 'grad_norm': '0.4091', 'learning_rate': '3.792e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 33193984, 'tokens/trainable': 32817800, 'epoch': '6.016'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4052/5680 [10:14:10<3:35:00,  7.92s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4053/5680 [10:14:18<3:34:43,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5875', 'grad_norm': '0.4092', 'learning_rate': '3.787e-05', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33202176, 'tokens/trainable': 32825956, 'epoch': '6.016'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4053/5680 [10:14:18<3:34:43,  7.92s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4054/5680 [10:14:26<3:34:38,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5613', 'grad_norm': '0.3983', 'learning_rate': '3.783e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33210368, 'tokens/trainable': 32834144, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4054/5680 [10:14:26<3:34:38,  7.92s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4055/5680 [10:14:34<3:34:39,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.526', 'grad_norm': '0.434', 'learning_rate': '3.779e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33218560, 'tokens/trainable': 32842268, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 4055/5680 [10:14:34<3:34:39,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4056/5680 [10:14:42<3:34:43,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5353', 'grad_norm': '0.3747', 'learning_rate': '3.774e-05', 'ppl': '1.708', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 33226752, 'tokens/trainable': 32850442, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4056/5680 [10:14:42<3:34:43,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4057/5680 [10:14:50<3:34:28,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3529', 'grad_norm': '0.4639', 'learning_rate': '3.77e-05', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 33234944, 'tokens/trainable': 32858614, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4057/5680 [10:14:50<3:34:28,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4058/5680 [10:14:58<3:34:19,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.401', 'grad_norm': '0.391', 'learning_rate': '3.766e-05', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 33243136, 'tokens/trainable': 32866710, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4058/5680 [10:14:58<3:34:19,  7.93s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4059/5680 [10:15:06<3:33:54,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5381', 'grad_norm': '0.3747', 'learning_rate': '3.761e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 33251328, 'tokens/trainable': 32874900, 'epoch': '6.017'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 4059/5680 [10:15:06<3:33:54,  7.92s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4060/5680 [10:15:14<3:34:22,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.459', 'grad_norm': '0.4297', 'learning_rate': '3.757e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33259520, 'tokens/trainable': 32883078, 'epoch': '6.018'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4060/5680 [10:15:14<3:34:22,  7.94s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4061/5680 [10:15:22<3:34:39,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4363', 'grad_norm': '0.423', 'learning_rate': '3.753e-05', 'ppl': '1.547', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 33267712, 'tokens/trainable': 32891204, 'epoch': '6.018'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4061/5680 [10:15:22<3:34:39,  7.96s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4062/5680 [10:15:30<3:34:44,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.8631', 'grad_norm': '0.4801', 'learning_rate': '3.748e-05', 'ppl': '2.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 33275904, 'tokens/trainable': 32899342, 'epoch': '6.018'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 4062/5680 [10:15:30<3:34:44,  7.96s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4063/5680 [10:15:38<3:34:36,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4305', 'grad_norm': '0.443', 'learning_rate': '3.744e-05', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33284096, 'tokens/trainable': 32907532, 'epoch': '6.018'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4063/5680 [10:15:38<3:34:36,  7.96s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4064/5680 [10:15:46<3:34:09,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4046', 'grad_norm': '0.3822', 'learning_rate': '3.74e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33292288, 'tokens/trainable': 32915656, 'epoch': '6.018'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4064/5680 [10:15:46<3:34:09,  7.95s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4065/5680 [10:15:53<3:33:42,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4969', 'grad_norm': '0.4024', 'learning_rate': '3.735e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33300480, 'tokens/trainable': 32923796, 'epoch': '6.018'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4065/5680 [10:15:53<3:33:42,  7.94s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4066/5680 [10:16:01<3:33:33,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6351', 'grad_norm': '0.3964', 'learning_rate': '3.731e-05', 'ppl': '1.887', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 33308672, 'tokens/trainable': 32931916, 'epoch': '6.019'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 4066/5680 [10:16:01<3:33:33,  7.94s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4067/5680 [10:16:09<3:33:51,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4228', 'grad_norm': '0.4177', 'learning_rate': '3.727e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 33316864, 'tokens/trainable': 32940086, 'epoch': '6.019'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4067/5680 [10:16:09<3:33:51,  7.96s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4068/5680 [10:16:17<3:33:35,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6288', 'grad_norm': '0.4416', 'learning_rate': '3.723e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33325056, 'tokens/trainable': 32948256, 'epoch': '6.019'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4068/5680 [10:16:17<3:33:35,  7.95s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4069/5680 [10:16:25<3:32:51,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6148', 'grad_norm': '0.4122', 'learning_rate': '3.718e-05', 'ppl': '1.849', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 33333248, 'tokens/trainable': 32956352, 'epoch': '6.019'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4069/5680 [10:16:25<3:32:51,  7.93s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4070/5680 [10:16:33<3:33:19,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.655', 'grad_norm': '0.4022', 'learning_rate': '3.714e-05', 'ppl': '1.925', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 33341440, 'tokens/trainable': 32964446, 'epoch': '6.019'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 4070/5680 [10:16:33<3:33:19,  7.95s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4071/5680 [10:16:41<3:33:33,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4705', 'grad_norm': '0.4345', 'learning_rate': '3.71e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 33349632, 'tokens/trainable': 32972548, 'epoch': '6.02'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4071/5680 [10:16:41<3:33:33,  7.96s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4072/5680 [10:16:49<3:33:40,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.6961', 'grad_norm': '0.4578', 'learning_rate': '3.705e-05', 'ppl': '2.006', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 33357824, 'tokens/trainable': 32980686, 'epoch': '6.02'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4072/5680 [10:16:49<3:33:40,  7.97s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4073/5680 [10:16:57<3:33:39,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.5846', 'grad_norm': '0.3998', 'learning_rate': '3.701e-05', 'ppl': '1.794', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 33366016, 'tokens/trainable': 32988802, 'epoch': '6.02'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4073/5680 [10:16:57<3:33:39,  7.98s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4074/5680 [10:17:05<3:33:44,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4109', 'grad_norm': '0.4477', 'learning_rate': '3.697e-05', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 33374208, 'tokens/trainable': 32996968, 'epoch': '6.02'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 4074/5680 [10:17:05<3:33:44,  7.99s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4075/5680 [10:17:13<3:33:24,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4344', 'grad_norm': '0.4068', 'learning_rate': '3.692e-05', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 33382400, 'tokens/trainable': 33005100, 'epoch': '6.02'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4075/5680 [10:17:13<3:33:24,  7.98s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4076/5680 [10:17:21<3:33:10,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.3971', 'grad_norm': '0.3746', 'learning_rate': '3.688e-05', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33390592, 'tokens/trainable': 33013266, 'epoch': '6.02'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4076/5680 [10:17:21<3:33:10,  7.97s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4077/5680 [10:17:29<3:33:00,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.488', 'grad_norm': '0.3977', 'learning_rate': '3.684e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 33398784, 'tokens/trainable': 33021440, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 4077/5680 [10:17:29<3:33:00,  7.97s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4078/5680 [10:17:37<3:33:02,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.8006', 'grad_norm': '0.4693', 'learning_rate': '3.68e-05', 'ppl': '2.227', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 33406976, 'tokens/trainable': 33029584, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4078/5680 [10:17:37<3:33:02,  7.98s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4079/5680 [10:17:45<3:32:39,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.4745', 'grad_norm': '0.4346', 'learning_rate': '3.675e-05', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33415168, 'tokens/trainable': 33037760, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4079/5680 [10:17:45<3:32:39,  7.97s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4080/5680 [10:17:53<3:34:48,  8.06s/it]                                                                                                                                                                                                                                             {'loss': '0.5457', 'grad_norm': '0.4048', 'learning_rate': '3.671e-05', 'ppl': '1.726', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.5', 'tokens/total': 33423360, 'tokens/trainable': 33045884, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4080/5680 [10:17:53<3:34:48,  8.06s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4081/5680 [10:18:01<3:33:41,  8.02s/it]                                                                                                                                                                                                                                             {'loss': '0.5241', 'grad_norm': '0.4311', 'learning_rate': '3.667e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33431552, 'tokens/trainable': 33054052, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 4081/5680 [10:18:01<3:33:41,  8.02s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4082/5680 [10:18:09<3:32:55,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5175', 'grad_norm': '0.3913', 'learning_rate': '3.662e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33439744, 'tokens/trainable': 33062224, 'epoch': '6.021'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4082/5680 [10:18:09<3:32:55,  7.99s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4083/5680 [10:18:17<3:32:22,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.389', 'grad_norm': '0.4418', 'learning_rate': '3.658e-05', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33447936, 'tokens/trainable': 33070358, 'epoch': '6.022'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4083/5680 [10:18:17<3:32:22,  7.98s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4084/5680 [10:18:25<3:31:40,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.6236', 'grad_norm': '0.432', 'learning_rate': '3.654e-05', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 33456128, 'tokens/trainable': 33078538, 'epoch': '6.022'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4084/5680 [10:18:25<3:31:40,  7.96s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4085/5680 [10:18:33<3:31:21,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3403', 'grad_norm': '0.3663', 'learning_rate': '3.65e-05', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 33464320, 'tokens/trainable': 33086676, 'epoch': '6.022'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 4085/5680 [10:18:33<3:31:21,  7.95s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4086/5680 [10:18:41<3:31:03,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.503', 'grad_norm': '0.3589', 'learning_rate': '3.645e-05', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 33472512, 'tokens/trainable': 33094772, 'epoch': '6.022'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4086/5680 [10:18:41<3:31:03,  7.94s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4087/5680 [10:18:49<3:31:00,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2153', 'grad_norm': '0.3771', 'learning_rate': '3.641e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 33480704, 'tokens/trainable': 33102942, 'epoch': '6.022'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4087/5680 [10:18:49<3:31:00,  7.95s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4088/5680 [10:18:57<3:30:42,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3624', 'grad_norm': '0.4537', 'learning_rate': '3.637e-05', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33488896, 'tokens/trainable': 33111062, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4088/5680 [10:18:57<3:30:42,  7.94s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4089/5680 [10:19:05<3:30:40,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3406', 'grad_norm': '0.3995', 'learning_rate': '3.633e-05', 'ppl': '1.406', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 33497088, 'tokens/trainable': 33119180, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 4089/5680 [10:19:05<3:30:40,  7.94s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4090/5680 [10:19:13<3:30:23,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3452', 'grad_norm': '0.3661', 'learning_rate': '3.628e-05', 'ppl': '1.412', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33505280, 'tokens/trainable': 33127332, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4090/5680 [10:19:13<3:30:23,  7.94s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4091/5680 [10:19:21<3:29:58,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3828', 'grad_norm': '0.387', 'learning_rate': '3.624e-05', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 33513472, 'tokens/trainable': 33135520, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4091/5680 [10:19:21<3:29:58,  7.93s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4092/5680 [10:19:28<3:29:27,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5016', 'grad_norm': '0.4375', 'learning_rate': '3.62e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 33521664, 'tokens/trainable': 33143696, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 4092/5680 [10:19:28<3:29:27,  7.91s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4093/5680 [10:19:36<3:29:29,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3521', 'grad_norm': '0.4423', 'learning_rate': '3.616e-05', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 33529856, 'tokens/trainable': 33151852, 'epoch': '6.023'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4093/5680 [10:19:36<3:29:29,  7.92s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4094/5680 [10:19:44<3:29:37,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4544', 'grad_norm': '0.4134', 'learning_rate': '3.611e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33538048, 'tokens/trainable': 33160000, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4094/5680 [10:19:44<3:29:37,  7.93s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4095/5680 [10:19:52<3:29:28,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3714', 'grad_norm': '0.394', 'learning_rate': '3.607e-05', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33546240, 'tokens/trainable': 33168156, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4095/5680 [10:19:52<3:29:28,  7.93s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4096/5680 [10:20:00<3:31:34,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4182', 'grad_norm': '0.369', 'learning_rate': '3.603e-05', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.2', 'tokens/total': 33554432, 'tokens/trainable': 33176244, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 4096/5680 [10:20:00<3:31:34,  8.01s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4097/5680 [10:20:08<3:30:31,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.3606', 'grad_norm': '0.4025', 'learning_rate': '3.599e-05', 'ppl': '1.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33562624, 'tokens/trainable': 33184372, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4097/5680 [10:20:08<3:30:31,  7.98s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4098/5680 [10:20:16<3:29:58,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5882', 'grad_norm': '0.5554', 'learning_rate': '3.594e-05', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33570816, 'tokens/trainable': 33192532, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4098/5680 [10:20:16<3:29:58,  7.96s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4099/5680 [10:20:24<3:29:25,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4927', 'grad_norm': '0.4378', 'learning_rate': '3.59e-05', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33579008, 'tokens/trainable': 33200704, 'epoch': '6.024'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4099/5680 [10:20:24<3:29:25,  7.95s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4100/5680 [10:20:32<3:29:05,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6085', 'grad_norm': '0.5329', 'learning_rate': '3.586e-05', 'ppl': '1.838', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33587200, 'tokens/trainable': 33208888, 'epoch': '6.025'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 4100/5680 [10:20:32<3:29:05,  7.94s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4101/5680 [10:20:40<3:28:43,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5902', 'grad_norm': '0.4355', 'learning_rate': '3.582e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 33595392, 'tokens/trainable': 33216960, 'epoch': '6.025'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4101/5680 [10:20:40<3:28:43,  7.93s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4102/5680 [10:20:48<3:28:21,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5486', 'grad_norm': '0.3947', 'learning_rate': '3.577e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 33603584, 'tokens/trainable': 33225136, 'epoch': '6.025'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4102/5680 [10:20:48<3:28:21,  7.92s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4103/5680 [10:20:56<3:28:08,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.346', 'grad_norm': '0.3177', 'learning_rate': '3.573e-05', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 33611776, 'tokens/trainable': 33233326, 'epoch': '6.025'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 4103/5680 [10:20:56<3:28:08,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4104/5680 [10:21:04<3:27:55,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4164', 'grad_norm': '0.3997', 'learning_rate': '3.569e-05', 'ppl': '1.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 33619968, 'tokens/trainable': 33241444, 'epoch': '6.025'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4104/5680 [10:21:04<3:27:55,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4105/5680 [10:21:12<3:27:43,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.498', 'grad_norm': '0.4076', 'learning_rate': '3.565e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 33628160, 'tokens/trainable': 33249618, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4105/5680 [10:21:12<3:27:43,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4106/5680 [10:21:19<3:27:20,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4701', 'grad_norm': '0.3915', 'learning_rate': '3.56e-05', 'ppl': '1.6', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 33636352, 'tokens/trainable': 33257780, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4106/5680 [10:21:19<3:27:20,  7.90s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4107/5680 [10:21:27<3:27:07,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3266', 'grad_norm': '0.3578', 'learning_rate': '3.556e-05', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 33644544, 'tokens/trainable': 33265942, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 4107/5680 [10:21:27<3:27:07,  7.90s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4108/5680 [10:21:35<3:27:12,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5691', 'grad_norm': '0.404', 'learning_rate': '3.552e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 33652736, 'tokens/trainable': 33274116, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4108/5680 [10:21:35<3:27:12,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4109/5680 [10:21:43<3:27:00,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5212', 'grad_norm': '0.4459', 'learning_rate': '3.548e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 33660928, 'tokens/trainable': 33282218, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4109/5680 [10:21:43<3:27:00,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4110/5680 [10:21:51<3:26:53,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4521', 'grad_norm': '0.3798', 'learning_rate': '3.543e-05', 'ppl': '1.572', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33669120, 'tokens/trainable': 33290356, 'epoch': '6.026'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4110/5680 [10:21:51<3:26:53,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4111/5680 [10:21:59<3:26:52,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4125', 'grad_norm': '0.3898', 'learning_rate': '3.539e-05', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33677312, 'tokens/trainable': 33298502, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 4111/5680 [10:21:59<3:26:52,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4112/5680 [10:22:07<3:26:52,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4665', 'grad_norm': '0.4183', 'learning_rate': '3.535e-05', 'ppl': '1.594', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 33685504, 'tokens/trainable': 33306612, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4112/5680 [10:22:07<3:26:52,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4113/5680 [10:22:15<3:26:58,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4355', 'grad_norm': '0.416', 'learning_rate': '3.531e-05', 'ppl': '1.546', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33693696, 'tokens/trainable': 33314790, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4113/5680 [10:22:15<3:26:58,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4114/5680 [10:22:23<3:26:42,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4754', 'grad_norm': '0.402', 'learning_rate': '3.527e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33701888, 'tokens/trainable': 33322884, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4114/5680 [10:22:23<3:26:42,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4115/5680 [10:22:31<3:26:37,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3808', 'grad_norm': '0.3879', 'learning_rate': '3.522e-05', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 33710080, 'tokens/trainable': 33331056, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                    | 4115/5680 [10:22:31<3:26:37,  7.92s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4116/5680 [10:22:39<3:26:17,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6437', 'grad_norm': '0.5248', 'learning_rate': '3.518e-05', 'ppl': '1.903', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 33718272, 'tokens/trainable': 33339224, 'epoch': '6.027'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4116/5680 [10:22:39<3:26:17,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4117/5680 [10:22:47<3:25:56,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3257', 'grad_norm': '0.3383', 'learning_rate': '3.514e-05', 'ppl': '1.385', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33726464, 'tokens/trainable': 33347368, 'epoch': '6.028'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4117/5680 [10:22:47<3:25:56,  7.91s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4118/5680 [10:22:54<3:25:44,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3538', 'grad_norm': '0.4136', 'learning_rate': '3.51e-05', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 33734656, 'tokens/trainable': 33355538, 'epoch': '6.028'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 4118/5680 [10:22:54<3:25:44,  7.90s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4119/5680 [10:23:02<3:25:13,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5581', 'grad_norm': '0.4328', 'learning_rate': '3.506e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 33742848, 'tokens/trainable': 33363698, 'epoch': '6.028'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4119/5680 [10:23:02<3:25:13,  7.89s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4120/5680 [10:23:10<3:24:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6298', 'grad_norm': '0.4939', 'learning_rate': '3.501e-05', 'ppl': '1.877', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 33751040, 'tokens/trainable': 33371826, 'epoch': '6.028'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4120/5680 [10:23:10<3:24:39,  7.87s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4121/5680 [10:23:18<3:24:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4319', 'grad_norm': '0.4021', 'learning_rate': '3.497e-05', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 33759232, 'tokens/trainable': 33379996, 'epoch': '6.028'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4121/5680 [10:23:18<3:24:15,  7.86s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4122/5680 [10:23:26<3:24:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6344', 'grad_norm': '0.4558', 'learning_rate': '3.493e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 33767424, 'tokens/trainable': 33388160, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                    | 4122/5680 [10:23:26<3:24:03,  7.86s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4123/5680 [10:23:34<3:24:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5547', 'grad_norm': '0.4555', 'learning_rate': '3.489e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33775616, 'tokens/trainable': 33396288, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4123/5680 [10:23:34<3:24:00,  7.86s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4124/5680 [10:23:42<3:24:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3913', 'grad_norm': '0.4002', 'learning_rate': '3.485e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 33783808, 'tokens/trainable': 33404408, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4124/5680 [10:23:42<3:24:24,  7.88s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4125/5680 [10:23:49<3:24:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.378', 'grad_norm': '0.4112', 'learning_rate': '3.48e-05', 'ppl': '1.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 33792000, 'tokens/trainable': 33412560, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4125/5680 [10:23:49<3:24:18,  7.88s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4126/5680 [10:23:57<3:23:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2666', 'grad_norm': '0.3293', 'learning_rate': '3.476e-05', 'ppl': '1.305', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 33800192, 'tokens/trainable': 33420696, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 4126/5680 [10:23:57<3:23:58,  7.88s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4127/5680 [10:24:05<3:24:11,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.378', 'grad_norm': '0.3721', 'learning_rate': '3.472e-05', 'ppl': '1.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 33808384, 'tokens/trainable': 33428844, 'epoch': '6.029'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4127/5680 [10:24:05<3:24:11,  7.89s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4128/5680 [10:24:13<3:23:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3362', 'grad_norm': '0.3715', 'learning_rate': '3.468e-05', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33816576, 'tokens/trainable': 33436968, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4128/5680 [10:24:13<3:23:52,  7.88s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4129/5680 [10:24:21<3:23:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4096', 'grad_norm': '0.4707', 'learning_rate': '3.464e-05', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 33824768, 'tokens/trainable': 33445148, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 4129/5680 [10:24:21<3:23:20,  7.87s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4130/5680 [10:24:29<3:23:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.574', 'grad_norm': '0.4787', 'learning_rate': '3.459e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 33832960, 'tokens/trainable': 33453298, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4130/5680 [10:24:29<3:23:14,  7.87s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4131/5680 [10:24:37<3:23:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2393', 'grad_norm': '0.445', 'learning_rate': '3.455e-05', 'ppl': '1.27', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 33841152, 'tokens/trainable': 33461428, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4131/5680 [10:24:37<3:23:15,  7.87s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4132/5680 [10:24:45<3:23:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3419', 'grad_norm': '0.343', 'learning_rate': '3.451e-05', 'ppl': '1.408', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 33849344, 'tokens/trainable': 33469582, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4132/5680 [10:24:45<3:23:24,  7.88s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4133/5680 [10:24:53<3:23:20,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2825', 'grad_norm': '0.3657', 'learning_rate': '3.447e-05', 'ppl': '1.326', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 33857536, 'tokens/trainable': 33477748, 'epoch': '6.03'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 4133/5680 [10:24:53<3:23:20,  7.89s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4134/5680 [10:25:00<3:22:57,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2967', 'grad_norm': '0.3959', 'learning_rate': '3.443e-05', 'ppl': '1.345', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 33865728, 'tokens/trainable': 33485900, 'epoch': '6.031'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4134/5680 [10:25:00<3:22:57,  7.88s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4135/5680 [10:25:08<3:22:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4708', 'grad_norm': '0.3716', 'learning_rate': '3.438e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 33873920, 'tokens/trainable': 33494014, 'epoch': '6.031'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4135/5680 [10:25:08<3:22:17,  7.86s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4136/5680 [10:25:16<3:22:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5629', 'grad_norm': '0.4463', 'learning_rate': '3.434e-05', 'ppl': '1.756', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 33882112, 'tokens/trainable': 33502168, 'epoch': '6.031'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4136/5680 [10:25:16<3:22:03,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4137/5680 [10:25:24<3:22:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3324', 'grad_norm': '0.3361', 'learning_rate': '3.43e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 33890304, 'tokens/trainable': 33510324, 'epoch': '6.031'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 4137/5680 [10:25:24<3:22:04,  7.86s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4138/5680 [10:25:32<3:22:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3675', 'grad_norm': '0.3959', 'learning_rate': '3.426e-05', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 33898496, 'tokens/trainable': 33518444, 'epoch': '6.031'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4138/5680 [10:25:32<3:22:19,  7.87s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4139/5680 [10:25:40<3:24:21,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4663', 'grad_norm': '0.4098', 'learning_rate': '3.422e-05', 'ppl': '1.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 33906688, 'tokens/trainable': 33526572, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4139/5680 [10:25:40<3:24:21,  7.96s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4140/5680 [10:25:48<3:23:25,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4234', 'grad_norm': '0.4615', 'learning_rate': '3.418e-05', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 33914880, 'tokens/trainable': 33534660, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4140/5680 [10:25:48<3:23:25,  7.93s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4141/5680 [10:25:56<3:22:31,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5949', 'grad_norm': '0.4907', 'learning_rate': '3.413e-05', 'ppl': '1.813', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 33923072, 'tokens/trainable': 33542792, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 4141/5680 [10:25:56<3:22:31,  7.90s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4142/5680 [10:26:04<3:22:22,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6807', 'grad_norm': '0.4687', 'learning_rate': '3.409e-05', 'ppl': '1.975', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 33931264, 'tokens/trainable': 33550876, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4142/5680 [10:26:04<3:22:22,  7.90s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4143/5680 [10:26:11<3:21:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2732', 'grad_norm': '0.3192', 'learning_rate': '3.405e-05', 'ppl': '1.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 33939456, 'tokens/trainable': 33559000, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4143/5680 [10:26:11<3:21:33,  7.87s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4144/5680 [10:26:19<3:21:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4545', 'grad_norm': '0.4086', 'learning_rate': '3.401e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33947648, 'tokens/trainable': 33567140, 'epoch': '6.032'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 4144/5680 [10:26:19<3:21:32,  7.87s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4145/5680 [10:26:27<3:20:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6594', 'grad_norm': '0.4192', 'learning_rate': '3.397e-05', 'ppl': '1.934', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1049', 'tokens/total': 33955840, 'tokens/trainable': 33575312, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4145/5680 [10:26:27<3:20:46,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4146/5680 [10:26:35<3:20:32,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5814', 'grad_norm': '0.426', 'learning_rate': '3.393e-05', 'ppl': '1.789', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 33964032, 'tokens/trainable': 33583444, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4146/5680 [10:26:35<3:20:32,  7.84s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4147/5680 [10:26:43<3:20:18,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4014', 'grad_norm': '0.4191', 'learning_rate': '3.389e-05', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 33972224, 'tokens/trainable': 33591624, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4147/5680 [10:26:43<3:20:18,  7.84s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4148/5680 [10:26:51<3:20:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3855', 'grad_norm': '0.3913', 'learning_rate': '3.384e-05', 'ppl': '1.47', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 33980416, 'tokens/trainable': 33599760, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 4148/5680 [10:26:51<3:20:27,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4149/5680 [10:26:58<3:20:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3891', 'grad_norm': '0.462', 'learning_rate': '3.38e-05', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 33988608, 'tokens/trainable': 33607928, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4149/5680 [10:26:58<3:20:19,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4150/5680 [10:27:06<3:20:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4527', 'grad_norm': '0.4612', 'learning_rate': '3.376e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 33996800, 'tokens/trainable': 33616064, 'epoch': '6.033'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4150/5680 [10:27:06<3:20:23,  7.86s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4151/5680 [10:27:14<3:22:04,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.358', 'grad_norm': '0.3839', 'learning_rate': '3.372e-05', 'ppl': '1.43', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.5', 'tokens/total': 34004992, 'tokens/trainable': 33624128, 'epoch': '6.034'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4151/5680 [10:27:14<3:22:04,  7.93s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4152/5680 [10:27:22<3:21:30,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.7588', 'grad_norm': '0.4564', 'learning_rate': '3.368e-05', 'ppl': '2.136', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 34013184, 'tokens/trainable': 33632240, 'epoch': '6.034'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 4152/5680 [10:27:22<3:21:30,  7.91s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4153/5680 [10:27:30<3:20:49,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5776', 'grad_norm': '0.4252', 'learning_rate': '3.364e-05', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34021376, 'tokens/trainable': 33640384, 'epoch': '6.034'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4153/5680 [10:27:30<3:20:49,  7.89s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4154/5680 [10:27:38<3:20:33,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3832', 'grad_norm': '0.3803', 'learning_rate': '3.36e-05', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 34029568, 'tokens/trainable': 33648504, 'epoch': '6.034'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4154/5680 [10:27:38<3:20:33,  7.89s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4155/5680 [10:27:46<3:20:31,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5757', 'grad_norm': '0.4565', 'learning_rate': '3.355e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34037760, 'tokens/trainable': 33656648, 'epoch': '6.034'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 4155/5680 [10:27:46<3:20:31,  7.89s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4156/5680 [10:27:54<3:20:22,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4333', 'grad_norm': '0.4322', 'learning_rate': '3.351e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34045952, 'tokens/trainable': 33664836, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4156/5680 [10:27:54<3:20:22,  7.89s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4157/5680 [10:28:02<3:20:06,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3416', 'grad_norm': '0.3952', 'learning_rate': '3.347e-05', 'ppl': '1.407', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34054144, 'tokens/trainable': 33673008, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4157/5680 [10:28:02<3:20:06,  7.88s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4158/5680 [10:28:09<3:19:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4972', 'grad_norm': '0.4535', 'learning_rate': '3.343e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 34062336, 'tokens/trainable': 33681080, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4158/5680 [10:28:09<3:19:47,  7.88s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4159/5680 [10:28:17<3:19:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3444', 'grad_norm': '0.3775', 'learning_rate': '3.339e-05', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34070528, 'tokens/trainable': 33689212, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 4159/5680 [10:28:17<3:19:24,  7.87s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4160/5680 [10:28:25<3:18:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7302', 'grad_norm': '0.413', 'learning_rate': '3.335e-05', 'ppl': '2.075', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34078720, 'tokens/trainable': 33697328, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4160/5680 [10:28:25<3:18:53,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4161/5680 [10:28:33<3:18:34,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5532', 'grad_norm': '0.4282', 'learning_rate': '3.331e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 34086912, 'tokens/trainable': 33705508, 'epoch': '6.035'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4161/5680 [10:28:33<3:18:34,  7.84s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4162/5680 [10:28:41<3:18:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.408', 'grad_norm': '0.4254', 'learning_rate': '3.327e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34095104, 'tokens/trainable': 33713652, 'epoch': '6.036'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4162/5680 [10:28:41<3:18:35,  7.85s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4163/5680 [10:28:49<3:18:18,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6093', 'grad_norm': '0.3625', 'learning_rate': '3.322e-05', 'ppl': '1.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34103296, 'tokens/trainable': 33721780, 'epoch': '6.036'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 4163/5680 [10:28:49<3:18:18,  7.84s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4164/5680 [10:28:57<3:18:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.354', 'grad_norm': '0.4541', 'learning_rate': '3.318e-05', 'ppl': '1.425', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34111488, 'tokens/trainable': 33729920, 'epoch': '6.036'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4164/5680 [10:28:57<3:18:16,  7.85s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4165/5680 [10:29:04<3:18:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4354', 'grad_norm': '0.3672', 'learning_rate': '3.314e-05', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34119680, 'tokens/trainable': 33738080, 'epoch': '6.036'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4165/5680 [10:29:04<3:18:26,  7.86s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4166/5680 [10:29:12<3:18:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.645', 'grad_norm': '0.4227', 'learning_rate': '3.31e-05', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 34127872, 'tokens/trainable': 33746252, 'epoch': '6.036'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4166/5680 [10:29:12<3:18:03,  7.85s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4167/5680 [10:29:20<3:18:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3854', 'grad_norm': '0.3633', 'learning_rate': '3.306e-05', 'ppl': '1.47', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34136064, 'tokens/trainable': 33754432, 'epoch': '6.036'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 4167/5680 [10:29:20<3:18:03,  7.85s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4168/5680 [10:29:28<3:17:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5917', 'grad_norm': '0.482', 'learning_rate': '3.302e-05', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 34144256, 'tokens/trainable': 33762608, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4168/5680 [10:29:28<3:17:52,  7.85s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4169/5680 [10:29:36<3:17:47,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3918', 'grad_norm': '0.402', 'learning_rate': '3.298e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34152448, 'tokens/trainable': 33770776, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4169/5680 [10:29:36<3:17:47,  7.85s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4170/5680 [10:29:44<3:17:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3677', 'grad_norm': '0.3984', 'learning_rate': '3.294e-05', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34160640, 'tokens/trainable': 33778956, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 4170/5680 [10:29:44<3:17:54,  7.86s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4171/5680 [10:29:52<3:17:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.449', 'grad_norm': '0.3874', 'learning_rate': '3.29e-05', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34168832, 'tokens/trainable': 33787096, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4171/5680 [10:29:52<3:17:47,  7.86s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4172/5680 [10:29:59<3:17:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3537', 'grad_norm': '0.3976', 'learning_rate': '3.285e-05', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34177024, 'tokens/trainable': 33795256, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4172/5680 [10:29:59<3:17:49,  7.87s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4173/5680 [10:30:07<3:17:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.469', 'grad_norm': '0.4758', 'learning_rate': '3.281e-05', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34185216, 'tokens/trainable': 33803416, 'epoch': '6.037'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4173/5680 [10:30:07<3:17:45,  7.87s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4174/5680 [10:30:15<3:17:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8114', 'grad_norm': '0.473', 'learning_rate': '3.277e-05', 'ppl': '2.251', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34193408, 'tokens/trainable': 33811540, 'epoch': '6.038'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 4174/5680 [10:30:15<3:17:26,  7.87s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4175/5680 [10:30:23<3:16:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7384', 'grad_norm': '0.4358', 'learning_rate': '3.273e-05', 'ppl': '2.093', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34201600, 'tokens/trainable': 33819644, 'epoch': '6.038'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4175/5680 [10:30:23<3:16:50,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4176/5680 [10:30:31<3:16:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6109', 'grad_norm': '0.4196', 'learning_rate': '3.269e-05', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34209792, 'tokens/trainable': 33827784, 'epoch': '6.038'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4176/5680 [10:30:31<3:16:44,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4177/5680 [10:30:39<3:16:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3722', 'grad_norm': '0.3381', 'learning_rate': '3.265e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 34217984, 'tokens/trainable': 33835968, 'epoch': '6.038'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4177/5680 [10:30:39<3:16:40,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4178/5680 [10:30:47<3:16:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3926', 'grad_norm': '0.4068', 'learning_rate': '3.261e-05', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34226176, 'tokens/trainable': 33844092, 'epoch': '6.038'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 4178/5680 [10:30:47<3:16:30,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4179/5680 [10:30:54<3:16:13,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6802', 'grad_norm': '0.4114', 'learning_rate': '3.257e-05', 'ppl': '1.974', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34234368, 'tokens/trainable': 33852240, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4179/5680 [10:30:54<3:16:13,  7.84s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4180/5680 [10:31:02<3:16:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5696', 'grad_norm': '0.4171', 'learning_rate': '3.253e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34242560, 'tokens/trainable': 33860420, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4180/5680 [10:31:02<3:16:15,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4181/5680 [10:31:10<3:16:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5127', 'grad_norm': '0.3431', 'learning_rate': '3.249e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34250752, 'tokens/trainable': 33868572, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 4181/5680 [10:31:10<3:16:16,  7.86s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4182/5680 [10:31:18<3:16:05,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6233', 'grad_norm': '0.4443', 'learning_rate': '3.245e-05', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34258944, 'tokens/trainable': 33876708, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4182/5680 [10:31:18<3:16:05,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4183/5680 [10:31:26<3:15:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5698', 'grad_norm': '0.3602', 'learning_rate': '3.24e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34267136, 'tokens/trainable': 33884832, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4183/5680 [10:31:26<3:15:48,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4184/5680 [10:31:34<3:15:28,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.577', 'grad_norm': '0.4547', 'learning_rate': '3.236e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 34275328, 'tokens/trainable': 33893000, 'epoch': '6.039'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4184/5680 [10:31:34<3:15:28,  7.84s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4185/5680 [10:31:41<3:15:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.689', 'grad_norm': '0.4425', 'learning_rate': '3.232e-05', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34283520, 'tokens/trainable': 33901180, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 4185/5680 [10:31:41<3:15:38,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4186/5680 [10:31:49<3:15:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4627', 'grad_norm': '0.428', 'learning_rate': '3.228e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34291712, 'tokens/trainable': 33909324, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4186/5680 [10:31:49<3:15:27,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4187/5680 [10:31:57<3:15:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6239', 'grad_norm': '0.4505', 'learning_rate': '3.224e-05', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 34299904, 'tokens/trainable': 33917504, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4187/5680 [10:31:57<3:15:22,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4188/5680 [10:32:05<3:15:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4313', 'grad_norm': '0.3748', 'learning_rate': '3.22e-05', 'ppl': '1.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 34308096, 'tokens/trainable': 33925692, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4188/5680 [10:32:05<3:15:10,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4189/5680 [10:32:13<3:14:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3097', 'grad_norm': '0.3348', 'learning_rate': '3.216e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34316288, 'tokens/trainable': 33933784, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 4189/5680 [10:32:13<3:14:58,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4190/5680 [10:32:21<3:14:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3285', 'grad_norm': '0.3843', 'learning_rate': '3.212e-05', 'ppl': '1.389', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34324480, 'tokens/trainable': 33941956, 'epoch': '6.04'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4190/5680 [10:32:21<3:14:53,  7.85s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4191/5680 [10:32:29<3:15:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5504', 'grad_norm': '0.3888', 'learning_rate': '3.208e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 34332672, 'tokens/trainable': 33950120, 'epoch': '6.041'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4191/5680 [10:32:29<3:15:25,  7.87s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4192/5680 [10:32:37<3:15:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3273', 'grad_norm': '0.3879', 'learning_rate': '3.204e-05', 'ppl': '1.387', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 34340864, 'tokens/trainable': 33958228, 'epoch': '6.041'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4192/5680 [10:32:37<3:15:46,  7.89s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4193/5680 [10:32:45<3:15:51,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.562', 'grad_norm': '0.4493', 'learning_rate': '3.2e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 34349056, 'tokens/trainable': 33966348, 'epoch': '6.041'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 4193/5680 [10:32:45<3:15:51,  7.90s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4194/5680 [10:32:52<3:15:30,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3432', 'grad_norm': '0.4196', 'learning_rate': '3.196e-05', 'ppl': '1.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34357248, 'tokens/trainable': 33974500, 'epoch': '6.041'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4194/5680 [10:32:52<3:15:30,  7.89s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4195/5680 [10:33:00<3:15:09,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4831', 'grad_norm': '0.4142', 'learning_rate': '3.192e-05', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34365440, 'tokens/trainable': 33982656, 'epoch': '6.041'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4195/5680 [10:33:00<3:15:09,  7.89s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4196/5680 [10:33:08<3:14:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.454', 'grad_norm': '0.4413', 'learning_rate': '3.188e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34373632, 'tokens/trainable': 33990812, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 4196/5680 [10:33:08<3:14:56,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4197/5680 [10:33:16<3:14:59,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3353', 'grad_norm': '0.3854', 'learning_rate': '3.184e-05', 'ppl': '1.398', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 34381824, 'tokens/trainable': 33998912, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4197/5680 [10:33:16<3:14:59,  7.89s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4198/5680 [10:33:24<3:14:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.601', 'grad_norm': '0.4027', 'learning_rate': '3.18e-05', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34390016, 'tokens/trainable': 34007048, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4198/5680 [10:33:24<3:14:23,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4199/5680 [10:33:32<3:13:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5425', 'grad_norm': '0.3781', 'learning_rate': '3.176e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34398208, 'tokens/trainable': 34015172, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4199/5680 [10:33:32<3:13:51,  7.85s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4200/5680 [10:33:40<3:13:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4556', 'grad_norm': '0.4308', 'learning_rate': '3.171e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34406400, 'tokens/trainable': 34023336, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 4200/5680 [10:33:40<3:13:40,  7.85s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4201/5680 [10:33:47<3:14:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5467', 'grad_norm': '0.5273', 'learning_rate': '3.167e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 34414592, 'tokens/trainable': 34031488, 'epoch': '6.042'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4201/5680 [10:33:47<3:14:04,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4202/5680 [10:33:55<3:13:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4791', 'grad_norm': '0.4546', 'learning_rate': '3.163e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34422784, 'tokens/trainable': 34039656, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4202/5680 [10:33:55<3:13:56,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4203/5680 [10:34:03<3:15:33,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4947', 'grad_norm': '0.3742', 'learning_rate': '3.159e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 34430976, 'tokens/trainable': 34047800, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4203/5680 [10:34:03<3:15:33,  7.94s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4204/5680 [10:34:11<3:15:06,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3516', 'grad_norm': '0.4042', 'learning_rate': '3.155e-05', 'ppl': '1.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34439168, 'tokens/trainable': 34055976, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 4204/5680 [10:34:11<3:15:06,  7.93s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4205/5680 [10:34:19<3:14:24,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4625', 'grad_norm': '0.4003', 'learning_rate': '3.151e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34447360, 'tokens/trainable': 34064088, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4205/5680 [10:34:19<3:14:24,  7.91s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4206/5680 [10:34:27<3:13:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.543', 'grad_norm': '0.4137', 'learning_rate': '3.147e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34455552, 'tokens/trainable': 34072220, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4206/5680 [10:34:27<3:13:54,  7.89s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4207/5680 [10:34:35<3:13:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.585', 'grad_norm': '0.5356', 'learning_rate': '3.143e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 34463744, 'tokens/trainable': 34080384, 'epoch': '6.043'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 4207/5680 [10:34:35<3:13:20,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4208/5680 [10:34:43<3:13:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3862', 'grad_norm': '0.4364', 'learning_rate': '3.139e-05', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34471936, 'tokens/trainable': 34088568, 'epoch': '6.044'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4208/5680 [10:34:43<3:13:09,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4209/5680 [10:34:51<3:12:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5429', 'grad_norm': '0.3904', 'learning_rate': '3.135e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34480128, 'tokens/trainable': 34096720, 'epoch': '6.044'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4209/5680 [10:34:51<3:12:57,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4210/5680 [10:34:59<3:15:01,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3827', 'grad_norm': '0.3683', 'learning_rate': '3.131e-05', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 34488320, 'tokens/trainable': 34104860, 'epoch': '6.044'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4210/5680 [10:34:59<3:15:01,  7.96s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4211/5680 [10:35:07<3:14:18,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4194', 'grad_norm': '0.4295', 'learning_rate': '3.127e-05', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34496512, 'tokens/trainable': 34112988, 'epoch': '6.044'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4211/5680 [10:35:07<3:14:18,  7.94s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4212/5680 [10:35:15<3:13:57,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.423', 'grad_norm': '0.4519', 'learning_rate': '3.123e-05', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34504704, 'tokens/trainable': 34121152, 'epoch': '6.044'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4212/5680 [10:35:15<3:13:57,  7.93s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4213/5680 [10:35:22<3:13:36,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5319', 'grad_norm': '0.4033', 'learning_rate': '3.119e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34512896, 'tokens/trainable': 34129300, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4213/5680 [10:35:22<3:13:36,  7.92s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4214/5680 [10:35:30<3:13:03,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.288', 'grad_norm': '0.3441', 'learning_rate': '3.115e-05', 'ppl': '1.334', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34521088, 'tokens/trainable': 34137424, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4214/5680 [10:35:30<3:13:03,  7.90s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4215/5680 [10:35:38<3:12:31,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4685', 'grad_norm': '0.388', 'learning_rate': '3.111e-05', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34529280, 'tokens/trainable': 34145584, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 4215/5680 [10:35:38<3:12:31,  7.89s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4216/5680 [10:35:46<3:12:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5219', 'grad_norm': '0.5183', 'learning_rate': '3.107e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34537472, 'tokens/trainable': 34153760, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4216/5680 [10:35:46<3:12:19,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4217/5680 [10:35:54<3:12:04,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2907', 'grad_norm': '0.3406', 'learning_rate': '3.103e-05', 'ppl': '1.337', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34545664, 'tokens/trainable': 34161912, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4217/5680 [10:35:54<3:12:04,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4218/5680 [10:36:02<3:12:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3783', 'grad_norm': '0.3591', 'learning_rate': '3.099e-05', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 34553856, 'tokens/trainable': 34170052, 'epoch': '6.045'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4218/5680 [10:36:02<3:12:05,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4219/5680 [10:36:10<3:11:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4953', 'grad_norm': '0.4234', 'learning_rate': '3.095e-05', 'ppl': '1.641', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34562048, 'tokens/trainable': 34178184, 'epoch': '6.046'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 4219/5680 [10:36:10<3:11:37,  7.87s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4220/5680 [10:36:18<3:11:44,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.677', 'grad_norm': '0.4579', 'learning_rate': '3.091e-05', 'ppl': '1.968', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 34570240, 'tokens/trainable': 34186280, 'epoch': '6.046'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4220/5680 [10:36:18<3:11:44,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4221/5680 [10:36:25<3:11:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3777', 'grad_norm': '0.4561', 'learning_rate': '3.087e-05', 'ppl': '1.459', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34578432, 'tokens/trainable': 34194460, 'epoch': '6.046'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4221/5680 [10:36:25<3:11:37,  7.88s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4222/5680 [10:36:33<3:11:39,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.385', 'grad_norm': '0.4914', 'learning_rate': '3.083e-05', 'ppl': '1.47', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34586624, 'tokens/trainable': 34202636, 'epoch': '6.046'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 4222/5680 [10:36:33<3:11:39,  7.89s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4223/5680 [10:36:41<3:11:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3785', 'grad_norm': '0.3716', 'learning_rate': '3.079e-05', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 34594816, 'tokens/trainable': 34210808, 'epoch': '6.046'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4223/5680 [10:36:41<3:11:02,  7.87s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4224/5680 [10:36:49<3:10:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4385', 'grad_norm': '0.389', 'learning_rate': '3.075e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34603008, 'tokens/trainable': 34218980, 'epoch': '6.046'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4224/5680 [10:36:49<3:10:49,  7.86s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4225/5680 [10:36:57<3:10:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6134', 'grad_norm': '0.4054', 'learning_rate': '3.071e-05', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34611200, 'tokens/trainable': 34227128, 'epoch': '6.047'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4225/5680 [10:36:57<3:10:33,  7.86s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4226/5680 [10:37:05<3:10:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4893', 'grad_norm': '0.4302', 'learning_rate': '3.067e-05', 'ppl': '1.631', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34619392, 'tokens/trainable': 34235260, 'epoch': '6.047'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 4226/5680 [10:37:05<3:10:11,  7.85s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4227/5680 [10:37:13<3:10:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2852', 'grad_norm': '0.3903', 'learning_rate': '3.063e-05', 'ppl': '1.33', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34627584, 'tokens/trainable': 34243416, 'epoch': '6.047'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4227/5680 [10:37:13<3:10:11,  7.85s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4228/5680 [10:37:20<3:10:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3919', 'grad_norm': '0.3577', 'learning_rate': '3.059e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34635776, 'tokens/trainable': 34251564, 'epoch': '6.047'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4228/5680 [10:37:20<3:10:18,  7.86s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4229/5680 [10:37:28<3:10:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5531', 'grad_norm': '0.4485', 'learning_rate': '3.055e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34643968, 'tokens/trainable': 34259736, 'epoch': '6.047'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4229/5680 [10:37:28<3:10:14,  7.87s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4230/5680 [10:37:36<3:10:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3548', 'grad_norm': '0.3742', 'learning_rate': '3.051e-05', 'ppl': '1.426', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34652160, 'tokens/trainable': 34267880, 'epoch': '6.048'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 4230/5680 [10:37:36<3:10:15,  7.87s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4231/5680 [10:37:44<3:09:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.427', 'grad_norm': '0.3739', 'learning_rate': '3.047e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 34660352, 'tokens/trainable': 34275984, 'epoch': '6.048'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4231/5680 [10:37:44<3:09:53,  7.86s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4232/5680 [10:37:52<3:10:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.411', 'grad_norm': '0.4632', 'learning_rate': '3.043e-05', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34668544, 'tokens/trainable': 34284176, 'epoch': '6.048'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4232/5680 [10:37:52<3:10:00,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4233/5680 [10:38:00<3:09:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4477', 'grad_norm': '0.3616', 'learning_rate': '3.039e-05', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 34676736, 'tokens/trainable': 34292352, 'epoch': '6.048'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 4233/5680 [10:38:00<3:09:42,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4234/5680 [10:38:08<3:09:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.687', 'grad_norm': '0.4992', 'learning_rate': '3.035e-05', 'ppl': '1.988', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34684928, 'tokens/trainable': 34300508, 'epoch': '6.048'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4234/5680 [10:38:08<3:09:42,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4235/5680 [10:38:16<3:09:43,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4815', 'grad_norm': '0.4752', 'learning_rate': '3.031e-05', 'ppl': '1.618', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 34693120, 'tokens/trainable': 34308644, 'epoch': '6.048'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4235/5680 [10:38:16<3:09:43,  7.88s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4236/5680 [10:38:23<3:09:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.281', 'grad_norm': '0.3043', 'learning_rate': '3.027e-05', 'ppl': '1.324', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 34701312, 'tokens/trainable': 34316752, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4236/5680 [10:38:23<3:09:40,  7.88s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4237/5680 [10:38:31<3:09:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4358', 'grad_norm': '0.3499', 'learning_rate': '3.023e-05', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34709504, 'tokens/trainable': 34324868, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 4237/5680 [10:38:31<3:09:22,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4238/5680 [10:38:39<3:09:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4915', 'grad_norm': '0.3769', 'learning_rate': '3.019e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34717696, 'tokens/trainable': 34332984, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4238/5680 [10:38:39<3:09:09,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4239/5680 [10:38:47<3:09:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.273', 'grad_norm': '0.3413', 'learning_rate': '3.015e-05', 'ppl': '1.314', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 34725888, 'tokens/trainable': 34341152, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4239/5680 [10:38:47<3:09:04,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4240/5680 [10:38:55<3:08:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3409', 'grad_norm': '0.3818', 'learning_rate': '3.012e-05', 'ppl': '1.406', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34734080, 'tokens/trainable': 34349296, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4240/5680 [10:38:55<3:08:53,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4241/5680 [10:39:03<3:08:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2614', 'grad_norm': '0.4412', 'learning_rate': '3.008e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34742272, 'tokens/trainable': 34357472, 'epoch': '6.049'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 4241/5680 [10:39:03<3:08:40,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4242/5680 [10:39:11<3:08:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.368', 'grad_norm': '0.429', 'learning_rate': '3.004e-05', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 34750464, 'tokens/trainable': 34365568, 'epoch': '6.05'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4242/5680 [10:39:11<3:08:33,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4243/5680 [10:39:19<3:08:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3601', 'grad_norm': '0.3841', 'learning_rate': '3e-05', 'ppl': '1.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34758656, 'tokens/trainable': 34373736, 'epoch': '6.05'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4243/5680 [10:39:19<3:08:36,  7.87s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4244/5680 [10:39:26<3:08:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.8812', 'grad_norm': '0.4929', 'learning_rate': '2.996e-05', 'ppl': '2.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34766848, 'tokens/trainable': 34381864, 'epoch': '6.05'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4244/5680 [10:39:26<3:08:29,  7.88s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4245/5680 [10:39:34<3:08:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3684', 'grad_norm': '0.3532', 'learning_rate': '2.992e-05', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34775040, 'tokens/trainable': 34389996, 'epoch': '6.05'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 4245/5680 [10:39:34<3:08:21,  7.88s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4246/5680 [10:39:42<3:07:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3157', 'grad_norm': '0.33', 'learning_rate': '2.988e-05', 'ppl': '1.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 34783232, 'tokens/trainable': 34398176, 'epoch': '6.05'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4246/5680 [10:39:42<3:07:55,  7.86s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4247/5680 [10:39:50<3:07:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5114', 'grad_norm': '0.4674', 'learning_rate': '2.984e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34791424, 'tokens/trainable': 34406344, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4247/5680 [10:39:50<3:07:40,  7.86s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4248/5680 [10:39:58<3:07:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4467', 'grad_norm': '0.3797', 'learning_rate': '2.98e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34799616, 'tokens/trainable': 34414488, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 4248/5680 [10:39:58<3:07:27,  7.85s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4249/5680 [10:40:06<3:07:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6535', 'grad_norm': '0.4753', 'learning_rate': '2.976e-05', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34807808, 'tokens/trainable': 34422604, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4249/5680 [10:40:06<3:07:24,  7.86s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4250/5680 [10:40:13<3:07:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6775', 'grad_norm': '0.4647', 'learning_rate': '2.972e-05', 'ppl': '1.969', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34816000, 'tokens/trainable': 34430744, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4250/5680 [10:40:13<3:07:04,  7.85s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4251/5680 [10:40:21<3:07:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6094', 'grad_norm': '0.4123', 'learning_rate': '2.968e-05', 'ppl': '1.839', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34824192, 'tokens/trainable': 34438928, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4251/5680 [10:40:21<3:07:04,  7.85s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4252/5680 [10:40:29<3:07:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6929', 'grad_norm': '0.5059', 'learning_rate': '2.964e-05', 'ppl': '2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34832384, 'tokens/trainable': 34447120, 'epoch': '6.051'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 4252/5680 [10:40:29<3:07:07,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4253/5680 [10:40:37<3:07:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4419', 'grad_norm': '0.3764', 'learning_rate': '2.96e-05', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34840576, 'tokens/trainable': 34455248, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4253/5680 [10:40:37<3:07:04,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4254/5680 [10:40:45<3:06:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5774', 'grad_norm': '0.4238', 'learning_rate': '2.956e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 34848768, 'tokens/trainable': 34463376, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4254/5680 [10:40:45<3:06:55,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4255/5680 [10:40:53<3:06:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4621', 'grad_norm': '0.356', 'learning_rate': '2.952e-05', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34856960, 'tokens/trainable': 34471500, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4255/5680 [10:40:53<3:06:39,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4256/5680 [10:41:01<3:06:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3886', 'grad_norm': '0.3795', 'learning_rate': '2.949e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 34865152, 'tokens/trainable': 34479608, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 4256/5680 [10:41:01<3:06:36,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4257/5680 [10:41:09<3:06:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4098', 'grad_norm': '0.4279', 'learning_rate': '2.945e-05', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 34873344, 'tokens/trainable': 34487772, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4257/5680 [10:41:09<3:06:35,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4258/5680 [10:41:16<3:06:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.454', 'grad_norm': '0.4112', 'learning_rate': '2.941e-05', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34881536, 'tokens/trainable': 34495892, 'epoch': '6.052'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4258/5680 [10:41:16<3:06:17,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4259/5680 [10:41:24<3:06:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3732', 'grad_norm': '0.3538', 'learning_rate': '2.937e-05', 'ppl': '1.452', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 34889728, 'tokens/trainable': 34504016, 'epoch': '6.053'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4259/5680 [10:41:24<3:06:13,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4260/5680 [10:41:32<3:06:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5698', 'grad_norm': '0.6288', 'learning_rate': '2.933e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34897920, 'tokens/trainable': 34512196, 'epoch': '6.053'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4260/5680 [10:41:32<3:06:12,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4261/5680 [10:41:40<3:06:01,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4836', 'grad_norm': '0.38', 'learning_rate': '2.929e-05', 'ppl': '1.622', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34906112, 'tokens/trainable': 34520376, 'epoch': '6.053'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4261/5680 [10:41:40<3:06:01,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4262/5680 [10:41:48<3:05:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4124', 'grad_norm': '0.4506', 'learning_rate': '2.925e-05', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 34914304, 'tokens/trainable': 34528496, 'epoch': '6.053'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4262/5680 [10:41:48<3:05:43,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4263/5680 [10:41:56<3:05:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7878', 'grad_norm': '0.4612', 'learning_rate': '2.921e-05', 'ppl': '2.199', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 34922496, 'tokens/trainable': 34536640, 'epoch': '6.053'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 4263/5680 [10:41:56<3:05:42,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4264/5680 [10:42:04<3:05:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3341', 'grad_norm': '0.3345', 'learning_rate': '2.917e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34930688, 'tokens/trainable': 34544828, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4264/5680 [10:42:04<3:05:37,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4265/5680 [10:42:11<3:05:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.409', 'grad_norm': '0.3635', 'learning_rate': '2.913e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34938880, 'tokens/trainable': 34553000, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4265/5680 [10:42:11<3:05:24,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4266/5680 [10:42:19<3:05:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4015', 'grad_norm': '0.4264', 'learning_rate': '2.909e-05', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 34947072, 'tokens/trainable': 34561160, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4266/5680 [10:42:19<3:05:37,  7.88s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4267/5680 [10:42:27<3:05:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4596', 'grad_norm': '0.3636', 'learning_rate': '2.906e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 34955264, 'tokens/trainable': 34569340, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 4267/5680 [10:42:27<3:05:29,  7.88s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4268/5680 [10:42:35<3:05:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4979', 'grad_norm': '0.4306', 'learning_rate': '2.902e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 34963456, 'tokens/trainable': 34577504, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4268/5680 [10:42:35<3:05:30,  7.88s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4269/5680 [10:42:43<3:05:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4607', 'grad_norm': '0.4534', 'learning_rate': '2.898e-05', 'ppl': '1.585', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 34971648, 'tokens/trainable': 34585680, 'epoch': '6.054'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4269/5680 [10:42:43<3:05:20,  7.88s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4270/5680 [10:42:51<3:05:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4693', 'grad_norm': '0.4349', 'learning_rate': '2.894e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 34979840, 'tokens/trainable': 34593848, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4270/5680 [10:42:51<3:05:02,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4271/5680 [10:42:59<3:04:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6472', 'grad_norm': '0.4263', 'learning_rate': '2.89e-05', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 34988032, 'tokens/trainable': 34601920, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 4271/5680 [10:42:59<3:04:53,  7.87s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4272/5680 [10:43:07<3:04:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.427', 'grad_norm': '0.3926', 'learning_rate': '2.886e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 34996224, 'tokens/trainable': 34610080, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4272/5680 [10:43:07<3:04:32,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4273/5680 [10:43:14<3:04:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4556', 'grad_norm': '0.4193', 'learning_rate': '2.882e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35004416, 'tokens/trainable': 34618252, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4273/5680 [10:43:14<3:04:25,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4274/5680 [10:43:23<3:08:24,  8.04s/it]                                                                                                                                                                                                                                             {'loss': '0.4678', 'grad_norm': '0.3834', 'learning_rate': '2.878e-05', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '969.1', 'tokens/total': 35012608, 'tokens/trainable': 34626440, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 4274/5680 [10:43:23<3:08:24,  8.04s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4275/5680 [10:43:31<3:06:56,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.361', 'grad_norm': '0.4114', 'learning_rate': '2.874e-05', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35020800, 'tokens/trainable': 34634544, 'epoch': '6.055'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4275/5680 [10:43:31<3:06:56,  7.98s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4276/5680 [10:43:39<3:05:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3825', 'grad_norm': '0.4368', 'learning_rate': '2.871e-05', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35028992, 'tokens/trainable': 34642696, 'epoch': '6.056'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4276/5680 [10:43:39<3:05:55,  7.95s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4277/5680 [10:43:47<3:05:23,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3796', 'grad_norm': '0.4409', 'learning_rate': '2.867e-05', 'ppl': '1.462', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 35037184, 'tokens/trainable': 34650832, 'epoch': '6.056'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4277/5680 [10:43:47<3:05:23,  7.93s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4278/5680 [10:43:54<3:04:42,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4385', 'grad_norm': '0.4685', 'learning_rate': '2.863e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35045376, 'tokens/trainable': 34658984, 'epoch': '6.056'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 4278/5680 [10:43:54<3:04:42,  7.90s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4279/5680 [10:44:02<3:04:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4491', 'grad_norm': '0.4591', 'learning_rate': '2.859e-05', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35053568, 'tokens/trainable': 34667148, 'epoch': '6.056'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4279/5680 [10:44:02<3:04:19,  7.89s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4280/5680 [10:44:10<3:03:59,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3596', 'grad_norm': '0.4145', 'learning_rate': '2.855e-05', 'ppl': '1.433', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35061760, 'tokens/trainable': 34675272, 'epoch': '6.056'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4280/5680 [10:44:10<3:03:59,  7.89s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4281/5680 [10:44:18<3:03:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5281', 'grad_norm': '0.5718', 'learning_rate': '2.851e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35069952, 'tokens/trainable': 34683392, 'epoch': '6.057'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4281/5680 [10:44:18<3:03:19,  7.86s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4282/5680 [10:44:26<3:03:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6415', 'grad_norm': '0.403', 'learning_rate': '2.847e-05', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35078144, 'tokens/trainable': 34691552, 'epoch': '6.057'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 4282/5680 [10:44:26<3:03:12,  7.86s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4283/5680 [10:44:34<3:02:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4133', 'grad_norm': '0.3699', 'learning_rate': '2.843e-05', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 35086336, 'tokens/trainable': 34699720, 'epoch': '6.057'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4283/5680 [10:44:34<3:02:46,  7.85s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4284/5680 [10:44:41<3:02:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3897', 'grad_norm': '0.4211', 'learning_rate': '2.84e-05', 'ppl': '1.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35094528, 'tokens/trainable': 34707836, 'epoch': '6.057'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4284/5680 [10:44:41<3:02:43,  7.85s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4285/5680 [10:44:49<3:02:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2913', 'grad_norm': '0.3249', 'learning_rate': '2.836e-05', 'ppl': '1.338', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 35102720, 'tokens/trainable': 34715952, 'epoch': '6.057'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4285/5680 [10:44:49<3:02:51,  7.87s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4286/5680 [10:44:57<3:02:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2907', 'grad_norm': '0.4055', 'learning_rate': '2.832e-05', 'ppl': '1.337', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 35110912, 'tokens/trainable': 34724048, 'epoch': '6.057'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 4286/5680 [10:44:57<3:02:41,  7.86s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4287/5680 [10:45:05<3:02:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.389', 'grad_norm': '0.4079', 'learning_rate': '2.828e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 35119104, 'tokens/trainable': 34732192, 'epoch': '6.058'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4287/5680 [10:45:05<3:02:11,  7.85s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4288/5680 [10:45:13<3:01:55,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4157', 'grad_norm': '0.4561', 'learning_rate': '2.824e-05', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35127296, 'tokens/trainable': 34740276, 'epoch': '6.058'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4288/5680 [10:45:13<3:01:55,  7.84s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4289/5680 [10:45:21<3:01:39,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5483', 'grad_norm': '0.3901', 'learning_rate': '2.82e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 35135488, 'tokens/trainable': 34748444, 'epoch': '6.058'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 4289/5680 [10:45:21<3:01:39,  7.84s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4290/5680 [10:45:29<3:02:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5096', 'grad_norm': '0.3774', 'learning_rate': '2.816e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35143680, 'tokens/trainable': 34756628, 'epoch': '6.058'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4290/5680 [10:45:29<3:02:00,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4291/5680 [10:45:36<3:01:34,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7018', 'grad_norm': '0.4387', 'learning_rate': '2.813e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1048', 'tokens/total': 35151872, 'tokens/trainable': 34764812, 'epoch': '6.058'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4291/5680 [10:45:36<3:01:34,  7.84s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4292/5680 [10:45:44<3:01:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4364', 'grad_norm': '0.3964', 'learning_rate': '2.809e-05', 'ppl': '1.547', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35160064, 'tokens/trainable': 34772948, 'epoch': '6.058'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4292/5680 [10:45:44<3:01:45,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4293/5680 [10:45:52<3:01:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6253', 'grad_norm': '0.415', 'learning_rate': '2.805e-05', 'ppl': '1.869', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 35168256, 'tokens/trainable': 34781120, 'epoch': '6.059'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 4293/5680 [10:45:52<3:01:23,  7.85s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4294/5680 [10:46:00<3:01:11,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4269', 'grad_norm': '0.3969', 'learning_rate': '2.801e-05', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35176448, 'tokens/trainable': 34789252, 'epoch': '6.059'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4294/5680 [10:46:00<3:01:11,  7.84s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4295/5680 [10:46:08<3:00:59,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5268', 'grad_norm': '0.4209', 'learning_rate': '2.797e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35184640, 'tokens/trainable': 34797408, 'epoch': '6.059'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4295/5680 [10:46:08<3:00:59,  7.84s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4296/5680 [10:46:16<3:01:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4105', 'grad_norm': '0.4139', 'learning_rate': '2.793e-05', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35192832, 'tokens/trainable': 34805552, 'epoch': '6.059'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4296/5680 [10:46:16<3:01:13,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4297/5680 [10:46:24<3:01:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4056', 'grad_norm': '0.3911', 'learning_rate': '2.79e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35201024, 'tokens/trainable': 34813700, 'epoch': '6.059'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 4297/5680 [10:46:24<3:01:10,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4298/5680 [10:46:31<3:01:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5313', 'grad_norm': '0.3787', 'learning_rate': '2.786e-05', 'ppl': '1.701', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 35209216, 'tokens/trainable': 34821848, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4298/5680 [10:46:31<3:01:10,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4299/5680 [10:46:39<3:01:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5882', 'grad_norm': '0.4165', 'learning_rate': '2.782e-05', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 35217408, 'tokens/trainable': 34829972, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4299/5680 [10:46:39<3:01:08,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4300/5680 [10:46:47<3:01:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6005', 'grad_norm': '0.4184', 'learning_rate': '2.778e-05', 'ppl': '1.823', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35225600, 'tokens/trainable': 34838096, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 4300/5680 [10:46:47<3:01:00,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4301/5680 [10:46:55<3:00:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4296', 'grad_norm': '0.3612', 'learning_rate': '2.774e-05', 'ppl': '1.537', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35233792, 'tokens/trainable': 34846208, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4301/5680 [10:46:55<3:00:48,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4302/5680 [10:47:03<3:00:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2913', 'grad_norm': '0.4399', 'learning_rate': '2.77e-05', 'ppl': '1.338', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 35241984, 'tokens/trainable': 34854296, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4302/5680 [10:47:03<3:00:58,  7.88s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4303/5680 [10:47:11<3:00:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6048', 'grad_norm': '0.4171', 'learning_rate': '2.767e-05', 'ppl': '1.831', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35250176, 'tokens/trainable': 34862432, 'epoch': '6.06'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4303/5680 [10:47:11<3:00:37,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4304/5680 [10:47:19<3:00:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5507', 'grad_norm': '0.3756', 'learning_rate': '2.763e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35258368, 'tokens/trainable': 34870572, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 4304/5680 [10:47:19<3:00:12,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4305/5680 [10:47:26<3:00:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.457', 'grad_norm': '0.5001', 'learning_rate': '2.759e-05', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35266560, 'tokens/trainable': 34878736, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4305/5680 [10:47:26<3:00:00,  7.85s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4306/5680 [10:47:34<3:00:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3023', 'grad_norm': '0.3828', 'learning_rate': '2.755e-05', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 35274752, 'tokens/trainable': 34886824, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4306/5680 [10:47:34<3:00:25,  7.88s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4307/5680 [10:47:42<3:00:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6852', 'grad_norm': '0.4839', 'learning_rate': '2.751e-05', 'ppl': '1.984', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35282944, 'tokens/trainable': 34894956, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4307/5680 [10:47:42<3:00:04,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4308/5680 [10:47:50<2:59:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6462', 'grad_norm': '0.4033', 'learning_rate': '2.748e-05', 'ppl': '1.908', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 35291136, 'tokens/trainable': 34903136, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 4308/5680 [10:47:50<2:59:46,  7.86s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4309/5680 [10:47:58<2:59:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5597', 'grad_norm': '0.3864', 'learning_rate': '2.744e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35299328, 'tokens/trainable': 34911320, 'epoch': '6.061'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4309/5680 [10:47:58<2:59:43,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4310/5680 [10:48:06<2:59:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4328', 'grad_norm': '0.4135', 'learning_rate': '2.74e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35307520, 'tokens/trainable': 34919476, 'epoch': '6.062'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4310/5680 [10:48:06<2:59:44,  7.87s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4311/5680 [10:48:14<2:59:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5235', 'grad_norm': '0.3749', 'learning_rate': '2.736e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35315712, 'tokens/trainable': 34927580, 'epoch': '6.062'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4311/5680 [10:48:14<2:59:06,  7.85s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4312/5680 [10:48:21<2:58:44,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6538', 'grad_norm': '0.4062', 'learning_rate': '2.732e-05', 'ppl': '1.923', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 35323904, 'tokens/trainable': 34935760, 'epoch': '6.062'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 4312/5680 [10:48:21<2:58:44,  7.84s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4313/5680 [10:48:29<2:58:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7849', 'grad_norm': '0.5444', 'learning_rate': '2.729e-05', 'ppl': '2.192', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35332096, 'tokens/trainable': 34943936, 'epoch': '6.062'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4313/5680 [10:48:29<2:58:51,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4314/5680 [10:48:37<2:58:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2507', 'grad_norm': '0.3334', 'learning_rate': '2.725e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35340288, 'tokens/trainable': 34952056, 'epoch': '6.062'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4314/5680 [10:48:37<2:58:42,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4315/5680 [10:48:45<2:58:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3717', 'grad_norm': '0.4072', 'learning_rate': '2.721e-05', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35348480, 'tokens/trainable': 34960224, 'epoch': '6.062'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 4315/5680 [10:48:45<2:58:35,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4316/5680 [10:48:53<2:58:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3513', 'grad_norm': '0.4312', 'learning_rate': '2.717e-05', 'ppl': '1.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35356672, 'tokens/trainable': 34968336, 'epoch': '6.063'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4316/5680 [10:48:53<2:58:31,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4317/5680 [10:49:01<2:58:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2783', 'grad_norm': '0.3551', 'learning_rate': '2.713e-05', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35364864, 'tokens/trainable': 34976504, 'epoch': '6.063'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4317/5680 [10:49:01<2:58:23,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4318/5680 [10:49:09<2:58:03,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5229', 'grad_norm': '0.377', 'learning_rate': '2.71e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35373056, 'tokens/trainable': 34984636, 'epoch': '6.063'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4318/5680 [10:49:09<2:58:03,  7.84s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4319/5680 [10:49:16<2:57:55,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5107', 'grad_norm': '0.4421', 'learning_rate': '2.706e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35381248, 'tokens/trainable': 34992756, 'epoch': '6.063'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 4319/5680 [10:49:16<2:57:55,  7.84s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4320/5680 [10:49:24<2:57:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3549', 'grad_norm': '0.3722', 'learning_rate': '2.702e-05', 'ppl': '1.426', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35389440, 'tokens/trainable': 35000924, 'epoch': '6.063'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4320/5680 [10:49:24<2:57:51,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4321/5680 [10:49:32<2:57:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5531', 'grad_norm': '0.4779', 'learning_rate': '2.698e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35397632, 'tokens/trainable': 35009016, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4321/5680 [10:49:32<2:57:42,  7.85s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4322/5680 [10:49:40<2:57:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4719', 'grad_norm': '0.4912', 'learning_rate': '2.694e-05', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35405824, 'tokens/trainable': 35017172, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4322/5680 [10:49:40<2:57:50,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4323/5680 [10:49:48<2:57:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4218', 'grad_norm': '0.3808', 'learning_rate': '2.691e-05', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35414016, 'tokens/trainable': 35025336, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 4323/5680 [10:49:48<2:57:45,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4324/5680 [10:49:56<2:57:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5598', 'grad_norm': '0.4622', 'learning_rate': '2.687e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35422208, 'tokens/trainable': 35033520, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4324/5680 [10:49:56<2:57:54,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4325/5680 [10:50:04<2:57:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2673', 'grad_norm': '0.3328', 'learning_rate': '2.683e-05', 'ppl': '1.306', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35430400, 'tokens/trainable': 35041640, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4325/5680 [10:50:04<2:57:48,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4326/5680 [10:50:11<2:57:35,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3141', 'grad_norm': '0.4621', 'learning_rate': '2.679e-05', 'ppl': '1.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35438592, 'tokens/trainable': 35049784, 'epoch': '6.064'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 4326/5680 [10:50:11<2:57:35,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4327/5680 [10:50:19<2:57:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6381', 'grad_norm': '0.4143', 'learning_rate': '2.676e-05', 'ppl': '1.893', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35446784, 'tokens/trainable': 35057960, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4327/5680 [10:50:19<2:57:33,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4328/5680 [10:50:27<2:57:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4081', 'grad_norm': '0.387', 'learning_rate': '2.672e-05', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35454976, 'tokens/trainable': 35066112, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4328/5680 [10:50:27<2:57:24,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4329/5680 [10:50:35<2:57:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3855', 'grad_norm': '0.4143', 'learning_rate': '2.668e-05', 'ppl': '1.47', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35463168, 'tokens/trainable': 35074288, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4329/5680 [10:50:35<2:57:18,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4330/5680 [10:50:43<2:57:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5858', 'grad_norm': '0.4006', 'learning_rate': '2.664e-05', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35471360, 'tokens/trainable': 35082404, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 4330/5680 [10:50:43<2:57:05,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4331/5680 [10:50:51<2:57:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4244', 'grad_norm': '0.5428', 'learning_rate': '2.661e-05', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 35479552, 'tokens/trainable': 35090504, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4331/5680 [10:50:51<2:57:10,  7.88s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4332/5680 [10:50:59<2:56:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4451', 'grad_norm': '0.4452', 'learning_rate': '2.657e-05', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35487744, 'tokens/trainable': 35098616, 'epoch': '6.065'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4332/5680 [10:50:59<2:56:45,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4333/5680 [10:51:07<2:56:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6179', 'grad_norm': '0.4569', 'learning_rate': '2.653e-05', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 35495936, 'tokens/trainable': 35106800, 'epoch': '6.066'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4333/5680 [10:51:07<2:56:33,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4334/5680 [10:51:14<2:56:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.536', 'grad_norm': '0.4473', 'learning_rate': '2.649e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35504128, 'tokens/trainable': 35114956, 'epoch': '6.066'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 4334/5680 [10:51:14<2:56:16,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4335/5680 [10:51:22<2:56:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7108', 'grad_norm': '0.4362', 'learning_rate': '2.646e-05', 'ppl': '2.036', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35512320, 'tokens/trainable': 35123120, 'epoch': '6.066'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4335/5680 [10:51:22<2:56:15,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4336/5680 [10:51:30<2:56:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3198', 'grad_norm': '0.3634', 'learning_rate': '2.642e-05', 'ppl': '1.377', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35520512, 'tokens/trainable': 35131288, 'epoch': '6.066'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4336/5680 [10:51:30<2:56:07,  7.86s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4337/5680 [10:51:38<2:56:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4161', 'grad_norm': '0.4222', 'learning_rate': '2.638e-05', 'ppl': '1.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35528704, 'tokens/trainable': 35139460, 'epoch': '6.066'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4337/5680 [10:51:38<2:56:09,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4338/5680 [10:51:46<2:56:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2924', 'grad_norm': '0.3543', 'learning_rate': '2.634e-05', 'ppl': '1.34', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35536896, 'tokens/trainable': 35147624, 'epoch': '6.067'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 4338/5680 [10:51:46<2:56:05,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4339/5680 [10:51:54<2:55:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.602', 'grad_norm': '0.4038', 'learning_rate': '2.631e-05', 'ppl': '1.826', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35545088, 'tokens/trainable': 35155808, 'epoch': '6.067'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4339/5680 [10:51:54<2:55:59,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4340/5680 [10:52:02<2:55:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5994', 'grad_norm': '0.4126', 'learning_rate': '2.627e-05', 'ppl': '1.821', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 35553280, 'tokens/trainable': 35163908, 'epoch': '6.067'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4340/5680 [10:52:02<2:55:44,  7.87s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4341/5680 [10:52:09<2:55:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5432', 'grad_norm': '0.4694', 'learning_rate': '2.623e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 35561472, 'tokens/trainable': 35172084, 'epoch': '6.067'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 4341/5680 [10:52:09<2:55:12,  7.85s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4342/5680 [10:52:17<2:55:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3641', 'grad_norm': '0.4229', 'learning_rate': '2.619e-05', 'ppl': '1.439', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35569664, 'tokens/trainable': 35180248, 'epoch': '6.067'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4342/5680 [10:52:17<2:55:17,  7.86s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4343/5680 [10:52:25<2:55:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4707', 'grad_norm': '0.4009', 'learning_rate': '2.616e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35577856, 'tokens/trainable': 35188408, 'epoch': '6.067'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4343/5680 [10:52:25<2:55:11,  7.86s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4344/5680 [10:52:33<2:54:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5883', 'grad_norm': '0.4193', 'learning_rate': '2.612e-05', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 35586048, 'tokens/trainable': 35196552, 'epoch': '6.068'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4344/5680 [10:52:33<2:54:44,  7.85s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4345/5680 [10:52:41<2:54:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3887', 'grad_norm': '0.3635', 'learning_rate': '2.608e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35594240, 'tokens/trainable': 35204696, 'epoch': '6.068'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 4345/5680 [10:52:41<2:54:36,  7.85s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4346/5680 [10:52:49<2:56:18,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.386', 'grad_norm': '0.4018', 'learning_rate': '2.604e-05', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 35602432, 'tokens/trainable': 35212836, 'epoch': '6.068'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4346/5680 [10:52:49<2:56:18,  7.93s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4347/5680 [10:52:57<2:55:45,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4258', 'grad_norm': '0.3914', 'learning_rate': '2.601e-05', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35610624, 'tokens/trainable': 35220968, 'epoch': '6.068'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4347/5680 [10:52:57<2:55:45,  7.91s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4348/5680 [10:53:05<2:55:16,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3382', 'grad_norm': '0.3751', 'learning_rate': '2.597e-05', 'ppl': '1.402', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35618816, 'tokens/trainable': 35229148, 'epoch': '6.068'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4348/5680 [10:53:05<2:55:16,  7.90s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4349/5680 [10:53:13<2:54:55,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6355', 'grad_norm': '0.4348', 'learning_rate': '2.593e-05', 'ppl': '1.888', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35627008, 'tokens/trainable': 35237312, 'epoch': '6.068'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 4349/5680 [10:53:13<2:54:55,  7.89s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4350/5680 [10:53:20<2:54:47,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4693', 'grad_norm': '0.5208', 'learning_rate': '2.59e-05', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35635200, 'tokens/trainable': 35245480, 'epoch': '6.069'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4350/5680 [10:53:20<2:54:47,  7.89s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4351/5680 [10:53:28<2:54:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4388', 'grad_norm': '0.3794', 'learning_rate': '2.586e-05', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35643392, 'tokens/trainable': 35253616, 'epoch': '6.069'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4351/5680 [10:53:28<2:54:19,  7.87s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4352/5680 [10:53:36<2:54:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5979', 'grad_norm': '0.6435', 'learning_rate': '2.582e-05', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35651584, 'tokens/trainable': 35261780, 'epoch': '6.069'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 4352/5680 [10:53:36<2:54:12,  7.87s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4353/5680 [10:53:44<2:55:55,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5463', 'grad_norm': '0.4241', 'learning_rate': '2.578e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 35659776, 'tokens/trainable': 35269908, 'epoch': '6.069'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4353/5680 [10:53:44<2:55:55,  7.95s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4354/5680 [10:53:52<2:55:00,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3884', 'grad_norm': '0.4473', 'learning_rate': '2.575e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35667968, 'tokens/trainable': 35278016, 'epoch': '6.069'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4354/5680 [10:53:52<2:55:00,  7.92s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4355/5680 [10:54:00<2:54:29,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6935', 'grad_norm': '0.5589', 'learning_rate': '2.571e-05', 'ppl': '2.001', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35676160, 'tokens/trainable': 35286148, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4355/5680 [10:54:00<2:54:29,  7.90s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4356/5680 [10:54:08<2:54:03,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5172', 'grad_norm': '0.3988', 'learning_rate': '2.567e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 35684352, 'tokens/trainable': 35294264, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 4356/5680 [10:54:08<2:54:03,  7.89s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4357/5680 [10:54:16<2:53:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3229', 'grad_norm': '0.4349', 'learning_rate': '2.564e-05', 'ppl': '1.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35692544, 'tokens/trainable': 35302412, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4357/5680 [10:54:16<2:53:40,  7.88s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4358/5680 [10:54:24<2:53:52,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3638', 'grad_norm': '0.3789', 'learning_rate': '2.56e-05', 'ppl': '1.439', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 35700736, 'tokens/trainable': 35310604, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4358/5680 [10:54:24<2:53:52,  7.89s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4359/5680 [10:54:31<2:53:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3821', 'grad_norm': '0.3675', 'learning_rate': '2.556e-05', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35708928, 'tokens/trainable': 35318696, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4359/5680 [10:54:31<2:53:18,  7.87s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4360/5680 [10:54:39<2:53:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.188', 'grad_norm': '0.2726', 'learning_rate': '2.553e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35717120, 'tokens/trainable': 35326836, 'epoch': '6.07'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 4360/5680 [10:54:39<2:53:18,  7.88s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4361/5680 [10:54:47<2:52:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5285', 'grad_norm': '0.3924', 'learning_rate': '2.549e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 35725312, 'tokens/trainable': 35335004, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4361/5680 [10:54:47<2:52:48,  7.86s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4362/5680 [10:54:55<2:53:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3696', 'grad_norm': '0.4135', 'learning_rate': '2.545e-05', 'ppl': '1.447', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35733504, 'tokens/trainable': 35343172, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4362/5680 [10:54:55<2:53:00,  7.88s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4363/5680 [10:55:03<2:52:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5821', 'grad_norm': '0.4241', 'learning_rate': '2.541e-05', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 35741696, 'tokens/trainable': 35351312, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4363/5680 [10:55:03<2:52:27,  7.86s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4364/5680 [10:55:11<2:52:03,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.731', 'grad_norm': '0.4008', 'learning_rate': '2.538e-05', 'ppl': '2.077', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 35749888, 'tokens/trainable': 35359440, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 4364/5680 [10:55:11<2:52:03,  7.84s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4365/5680 [10:55:19<2:52:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3748', 'grad_norm': '0.3917', 'learning_rate': '2.534e-05', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35758080, 'tokens/trainable': 35367612, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4365/5680 [10:55:19<2:52:03,  7.85s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4366/5680 [10:55:26<2:52:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3738', 'grad_norm': '0.3539', 'learning_rate': '2.53e-05', 'ppl': '1.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 35766272, 'tokens/trainable': 35375700, 'epoch': '6.071'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4366/5680 [10:55:26<2:52:08,  7.86s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4367/5680 [10:55:34<2:52:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3763', 'grad_norm': '0.372', 'learning_rate': '2.527e-05', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35774464, 'tokens/trainable': 35383880, 'epoch': '6.072'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 4367/5680 [10:55:34<2:52:23,  7.88s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4368/5680 [10:55:42<2:52:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5125', 'grad_norm': '0.3845', 'learning_rate': '2.523e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 35782656, 'tokens/trainable': 35391968, 'epoch': '6.072'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4368/5680 [10:55:42<2:52:00,  7.87s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4369/5680 [10:55:50<2:51:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4224', 'grad_norm': '0.4671', 'learning_rate': '2.519e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 35790848, 'tokens/trainable': 35400140, 'epoch': '6.072'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4369/5680 [10:55:50<2:51:39,  7.86s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4370/5680 [10:55:58<2:51:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4085', 'grad_norm': '0.5403', 'learning_rate': '2.516e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35799040, 'tokens/trainable': 35408256, 'epoch': '6.072'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4370/5680 [10:55:58<2:51:33,  7.86s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4371/5680 [10:56:06<2:51:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4144', 'grad_norm': '0.3955', 'learning_rate': '2.512e-05', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35807232, 'tokens/trainable': 35416420, 'epoch': '6.072'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 4371/5680 [10:56:06<2:51:29,  7.86s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4372/5680 [10:56:14<2:51:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4274', 'grad_norm': '0.4858', 'learning_rate': '2.508e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35815424, 'tokens/trainable': 35424584, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4372/5680 [10:56:14<2:51:39,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4373/5680 [10:56:22<2:51:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3403', 'grad_norm': '0.4434', 'learning_rate': '2.505e-05', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35823616, 'tokens/trainable': 35432768, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4373/5680 [10:56:22<2:51:40,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4374/5680 [10:56:29<2:51:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4204', 'grad_norm': '0.4383', 'learning_rate': '2.501e-05', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 35831808, 'tokens/trainable': 35440920, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4374/5680 [10:56:29<2:51:26,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4375/5680 [10:56:37<2:51:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4029', 'grad_norm': '0.4239', 'learning_rate': '2.497e-05', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 35840000, 'tokens/trainable': 35449084, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 4375/5680 [10:56:37<2:51:38,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4376/5680 [10:56:45<2:51:34,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.508', 'grad_norm': '0.4131', 'learning_rate': '2.494e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35848192, 'tokens/trainable': 35457244, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4376/5680 [10:56:45<2:51:34,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4377/5680 [10:56:53<2:51:24,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7149', 'grad_norm': '0.4371', 'learning_rate': '2.49e-05', 'ppl': '2.044', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 35856384, 'tokens/trainable': 35465364, 'epoch': '6.073'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4377/5680 [10:56:53<2:51:24,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4378/5680 [10:57:01<2:51:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2492', 'grad_norm': '0.3407', 'learning_rate': '2.486e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 35864576, 'tokens/trainable': 35473480, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 4378/5680 [10:57:01<2:51:05,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4379/5680 [10:57:09<2:51:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4601', 'grad_norm': '0.426', 'learning_rate': '2.483e-05', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 35872768, 'tokens/trainable': 35481592, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4379/5680 [10:57:09<2:51:07,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4380/5680 [10:57:17<2:50:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5448', 'grad_norm': '0.4406', 'learning_rate': '2.479e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 35880960, 'tokens/trainable': 35489676, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4380/5680 [10:57:17<2:50:54,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4381/5680 [10:57:25<2:50:32,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3432', 'grad_norm': '0.4113', 'learning_rate': '2.476e-05', 'ppl': '1.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 35889152, 'tokens/trainable': 35497804, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4381/5680 [10:57:25<2:50:32,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4382/5680 [10:57:33<2:50:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4672', 'grad_norm': '0.3906', 'learning_rate': '2.472e-05', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 35897344, 'tokens/trainable': 35505972, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 4382/5680 [10:57:33<2:50:29,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4383/5680 [10:57:40<2:50:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3845', 'grad_norm': '0.462', 'learning_rate': '2.468e-05', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 35905536, 'tokens/trainable': 35514136, 'epoch': '6.074'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4383/5680 [10:57:40<2:50:05,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4384/5680 [10:57:48<2:49:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5564', 'grad_norm': '0.494', 'learning_rate': '2.465e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35913728, 'tokens/trainable': 35522304, 'epoch': '6.075'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4384/5680 [10:57:48<2:49:57,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4385/5680 [10:57:56<2:49:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5253', 'grad_norm': '0.4376', 'learning_rate': '2.461e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 35921920, 'tokens/trainable': 35530456, 'epoch': '6.075'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4385/5680 [10:57:56<2:49:55,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4386/5680 [10:58:04<2:50:08,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3889', 'grad_norm': '0.4433', 'learning_rate': '2.457e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 35930112, 'tokens/trainable': 35538572, 'epoch': '6.075'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 4386/5680 [10:58:04<2:50:08,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4387/5680 [10:58:12<2:50:02,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.7058', 'grad_norm': '0.4753', 'learning_rate': '2.454e-05', 'ppl': '2.026', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 35938304, 'tokens/trainable': 35546704, 'epoch': '6.075'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4387/5680 [10:58:12<2:50:02,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4388/5680 [10:58:20<2:49:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3946', 'grad_norm': '0.3674', 'learning_rate': '2.45e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35946496, 'tokens/trainable': 35554880, 'epoch': '6.075'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4388/5680 [10:58:20<2:49:50,  7.89s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4389/5680 [10:58:28<2:49:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.591', 'grad_norm': '0.4535', 'learning_rate': '2.446e-05', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 35954688, 'tokens/trainable': 35563072, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4389/5680 [10:58:28<2:49:27,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4390/5680 [10:58:36<2:49:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.377', 'grad_norm': '0.4231', 'learning_rate': '2.443e-05', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 35962880, 'tokens/trainable': 35571192, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 4390/5680 [10:58:36<2:49:24,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4391/5680 [10:58:43<2:49:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.424', 'grad_norm': '0.4222', 'learning_rate': '2.439e-05', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 35971072, 'tokens/trainable': 35579372, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4391/5680 [10:58:43<2:49:07,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4392/5680 [10:58:51<2:49:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6412', 'grad_norm': '0.4462', 'learning_rate': '2.436e-05', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 35979264, 'tokens/trainable': 35587488, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4392/5680 [10:58:51<2:49:02,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4393/5680 [10:58:59<2:48:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5337', 'grad_norm': '0.4044', 'learning_rate': '2.432e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 35987456, 'tokens/trainable': 35595656, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4393/5680 [10:58:59<2:48:49,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4394/5680 [10:59:07<2:48:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2933', 'grad_norm': '0.3748', 'learning_rate': '2.428e-05', 'ppl': '1.341', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 35995648, 'tokens/trainable': 35603808, 'epoch': '6.076'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4394/5680 [10:59:07<2:48:36,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4395/5680 [10:59:15<2:48:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.649', 'grad_norm': '0.3956', 'learning_rate': '2.425e-05', 'ppl': '1.914', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36003840, 'tokens/trainable': 35611980, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4395/5680 [10:59:15<2:48:24,  7.86s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4396/5680 [10:59:23<2:48:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4887', 'grad_norm': '0.4214', 'learning_rate': '2.421e-05', 'ppl': '1.63', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36012032, 'tokens/trainable': 35620108, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4396/5680 [10:59:23<2:48:17,  7.86s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4397/5680 [10:59:31<2:48:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4592', 'grad_norm': '0.454', 'learning_rate': '2.418e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36020224, 'tokens/trainable': 35628264, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 4397/5680 [10:59:31<2:48:17,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4398/5680 [10:59:39<2:48:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5476', 'grad_norm': '0.4113', 'learning_rate': '2.414e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 36028416, 'tokens/trainable': 35636400, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4398/5680 [10:59:39<2:48:17,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4399/5680 [10:59:46<2:47:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4546', 'grad_norm': '0.4473', 'learning_rate': '2.41e-05', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 36036608, 'tokens/trainable': 35644564, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4399/5680 [10:59:46<2:47:55,  7.87s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4400/5680 [10:59:54<2:48:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4494', 'grad_norm': '0.4125', 'learning_rate': '2.407e-05', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36044800, 'tokens/trainable': 35652696, 'epoch': '6.077'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4400/5680 [10:59:54<2:48:05,  7.88s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4401/5680 [11:00:02<2:48:02,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5174', 'grad_norm': '0.4573', 'learning_rate': '2.403e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36052992, 'tokens/trainable': 35660828, 'epoch': '6.078'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 4401/5680 [11:00:02<2:48:02,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4402/5680 [11:00:10<2:47:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3916', 'grad_norm': '0.3611', 'learning_rate': '2.4e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 36061184, 'tokens/trainable': 35669012, 'epoch': '6.078'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4402/5680 [11:00:10<2:47:41,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4403/5680 [11:00:18<2:47:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3993', 'grad_norm': '0.3924', 'learning_rate': '2.396e-05', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36069376, 'tokens/trainable': 35677180, 'epoch': '6.078'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4403/5680 [11:00:18<2:47:28,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4404/5680 [11:00:26<2:47:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5396', 'grad_norm': '0.5071', 'learning_rate': '2.392e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36077568, 'tokens/trainable': 35685320, 'epoch': '6.078'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 4404/5680 [11:00:26<2:47:18,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4405/5680 [11:00:34<2:46:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4313', 'grad_norm': '0.3817', 'learning_rate': '2.389e-05', 'ppl': '1.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 36085760, 'tokens/trainable': 35693504, 'epoch': '6.078'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4405/5680 [11:00:34<2:46:53,  7.85s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4406/5680 [11:00:41<2:46:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4381', 'grad_norm': '0.4792', 'learning_rate': '2.385e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 36093952, 'tokens/trainable': 35701620, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4406/5680 [11:00:41<2:46:37,  7.85s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4407/5680 [11:00:49<2:46:24,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5421', 'grad_norm': '0.4517', 'learning_rate': '2.382e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 36102144, 'tokens/trainable': 35709788, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4407/5680 [11:00:49<2:46:24,  7.84s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4408/5680 [11:00:57<2:46:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6275', 'grad_norm': '0.415', 'learning_rate': '2.378e-05', 'ppl': '1.873', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 36110336, 'tokens/trainable': 35717880, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 4408/5680 [11:00:57<2:46:49,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4409/5680 [11:01:05<2:46:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7718', 'grad_norm': '0.4616', 'learning_rate': '2.374e-05', 'ppl': '2.164', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36118528, 'tokens/trainable': 35726000, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4409/5680 [11:01:05<2:46:21,  7.85s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4410/5680 [11:01:13<2:46:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3933', 'grad_norm': '0.3824', 'learning_rate': '2.371e-05', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36126720, 'tokens/trainable': 35734096, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4410/5680 [11:01:13<2:46:23,  7.86s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4411/5680 [11:01:21<2:46:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3832', 'grad_norm': '0.4009', 'learning_rate': '2.367e-05', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 36134912, 'tokens/trainable': 35742220, 'epoch': '6.079'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4411/5680 [11:01:21<2:46:28,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4412/5680 [11:01:29<2:46:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4224', 'grad_norm': '0.4364', 'learning_rate': '2.364e-05', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 36143104, 'tokens/trainable': 35750324, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4412/5680 [11:01:29<2:46:14,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4413/5680 [11:01:37<2:46:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3013', 'grad_norm': '0.4333', 'learning_rate': '2.36e-05', 'ppl': '1.352', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 36151296, 'tokens/trainable': 35758432, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4413/5680 [11:01:37<2:46:14,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4414/5680 [11:01:44<2:46:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4625', 'grad_norm': '0.3737', 'learning_rate': '2.357e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36159488, 'tokens/trainable': 35766528, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4414/5680 [11:01:44<2:46:06,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4415/5680 [11:01:52<2:45:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4327', 'grad_norm': '0.3994', 'learning_rate': '2.353e-05', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36167680, 'tokens/trainable': 35774672, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4415/5680 [11:01:52<2:45:49,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4416/5680 [11:02:00<2:45:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3913', 'grad_norm': '0.3648', 'learning_rate': '2.349e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36175872, 'tokens/trainable': 35782856, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 4416/5680 [11:02:00<2:45:43,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4417/5680 [11:02:08<2:45:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2785', 'grad_norm': '0.4223', 'learning_rate': '2.346e-05', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 36184064, 'tokens/trainable': 35790956, 'epoch': '6.08'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4417/5680 [11:02:08<2:45:46,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4418/5680 [11:02:16<2:45:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6754', 'grad_norm': '0.5167', 'learning_rate': '2.342e-05', 'ppl': '1.965', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36192256, 'tokens/trainable': 35799088, 'epoch': '6.081'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4418/5680 [11:02:16<2:45:32,  7.87s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4419/5680 [11:02:24<2:45:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5385', 'grad_norm': '0.4407', 'learning_rate': '2.339e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 36200448, 'tokens/trainable': 35807268, 'epoch': '6.081'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4419/5680 [11:02:24<2:45:35,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4420/5680 [11:02:32<2:45:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2304', 'grad_norm': '0.3381', 'learning_rate': '2.335e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 36208640, 'tokens/trainable': 35815420, 'epoch': '6.081'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4420/5680 [11:02:32<2:45:38,  7.89s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4421/5680 [11:02:40<2:45:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3026', 'grad_norm': '0.3528', 'learning_rate': '2.332e-05', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 36216832, 'tokens/trainable': 35823580, 'epoch': '6.081'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4421/5680 [11:02:40<2:45:25,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4422/5680 [11:02:47<2:45:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.584', 'grad_norm': '0.4391', 'learning_rate': '2.328e-05', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36225024, 'tokens/trainable': 35831672, 'epoch': '6.081'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4422/5680 [11:02:47<2:45:13,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4423/5680 [11:02:55<2:45:10,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6647', 'grad_norm': '0.5389', 'learning_rate': '2.325e-05', 'ppl': '1.944', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 36233216, 'tokens/trainable': 35839776, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 4423/5680 [11:02:55<2:45:10,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4424/5680 [11:03:03<2:44:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4464', 'grad_norm': '0.4215', 'learning_rate': '2.321e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 36241408, 'tokens/trainable': 35847936, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4424/5680 [11:03:03<2:44:56,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4425/5680 [11:03:11<2:46:23,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5837', 'grad_norm': '0.4191', 'learning_rate': '2.317e-05', 'ppl': '1.793', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 36249600, 'tokens/trainable': 35856096, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4425/5680 [11:03:11<2:46:23,  7.96s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4426/5680 [11:03:19<2:45:36,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5088', 'grad_norm': '0.4689', 'learning_rate': '2.314e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 36257792, 'tokens/trainable': 35864272, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4426/5680 [11:03:19<2:45:36,  7.92s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4427/5680 [11:03:27<2:44:51,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.504', 'grad_norm': '0.3808', 'learning_rate': '2.31e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 36265984, 'tokens/trainable': 35872432, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 4427/5680 [11:03:27<2:44:51,  7.89s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4428/5680 [11:03:35<2:44:32,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3369', 'grad_norm': '0.3594', 'learning_rate': '2.307e-05', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36274176, 'tokens/trainable': 35880592, 'epoch': '6.082'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4428/5680 [11:03:35<2:44:32,  7.89s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4429/5680 [11:03:43<2:44:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3443', 'grad_norm': '0.3989', 'learning_rate': '2.303e-05', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 36282368, 'tokens/trainable': 35888732, 'epoch': '6.083'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4429/5680 [11:03:43<2:44:14,  7.88s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4430/5680 [11:03:51<2:44:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.323', 'grad_norm': '0.358', 'learning_rate': '2.3e-05', 'ppl': '1.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 36290560, 'tokens/trainable': 35896808, 'epoch': '6.083'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 4430/5680 [11:03:51<2:44:08,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4431/5680 [11:03:58<2:43:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.585', 'grad_norm': '0.49', 'learning_rate': '2.296e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36298752, 'tokens/trainable': 35904976, 'epoch': '6.083'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4431/5680 [11:03:58<2:43:56,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4432/5680 [11:04:06<2:43:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7166', 'grad_norm': '0.4313', 'learning_rate': '2.293e-05', 'ppl': '2.047', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 36306944, 'tokens/trainable': 35913120, 'epoch': '6.083'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4432/5680 [11:04:06<2:43:45,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4433/5680 [11:04:14<2:43:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5864', 'grad_norm': '0.3931', 'learning_rate': '2.289e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 36315136, 'tokens/trainable': 35921260, 'epoch': '6.083'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4433/5680 [11:04:14<2:43:29,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4434/5680 [11:04:22<2:43:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5005', 'grad_norm': '0.7282', 'learning_rate': '2.286e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 36323328, 'tokens/trainable': 35929380, 'epoch': '6.083'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 4434/5680 [11:04:22<2:43:21,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4435/5680 [11:04:30<2:43:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4555', 'grad_norm': '0.4537', 'learning_rate': '2.282e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 36331520, 'tokens/trainable': 35937532, 'epoch': '6.084'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4435/5680 [11:04:30<2:43:02,  7.86s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4436/5680 [11:04:38<2:43:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2895', 'grad_norm': '0.4413', 'learning_rate': '2.279e-05', 'ppl': '1.336', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 36339712, 'tokens/trainable': 35945696, 'epoch': '6.084'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4436/5680 [11:04:38<2:43:01,  7.86s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4437/5680 [11:04:46<2:43:17,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2395', 'grad_norm': '0.4586', 'learning_rate': '2.275e-05', 'ppl': '1.271', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 36347904, 'tokens/trainable': 35953848, 'epoch': '6.084'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4437/5680 [11:04:46<2:43:17,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4438/5680 [11:04:54<2:42:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4259', 'grad_norm': '0.3807', 'learning_rate': '2.272e-05', 'ppl': '1.531', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36356096, 'tokens/trainable': 35962012, 'epoch': '6.084'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 4438/5680 [11:04:54<2:42:58,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4439/5680 [11:05:01<2:42:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3719', 'grad_norm': '0.3376', 'learning_rate': '2.268e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36364288, 'tokens/trainable': 35970104, 'epoch': '6.084'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4439/5680 [11:05:01<2:42:52,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4440/5680 [11:05:09<2:42:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3347', 'grad_norm': '0.4054', 'learning_rate': '2.265e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 36372480, 'tokens/trainable': 35978176, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4440/5680 [11:05:09<2:42:50,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4441/5680 [11:05:17<2:44:23,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.754', 'grad_norm': '0.4756', 'learning_rate': '2.261e-05', 'ppl': '2.125', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 36380672, 'tokens/trainable': 35986356, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4441/5680 [11:05:17<2:44:23,  7.96s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4442/5680 [11:05:25<2:43:48,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3624', 'grad_norm': '0.4216', 'learning_rate': '2.258e-05', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36388864, 'tokens/trainable': 35994464, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 4442/5680 [11:05:25<2:43:48,  7.94s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4443/5680 [11:05:33<2:43:05,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5618', 'grad_norm': '0.4423', 'learning_rate': '2.254e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36397056, 'tokens/trainable': 36002576, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4443/5680 [11:05:33<2:43:05,  7.91s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4444/5680 [11:05:41<2:42:32,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3937', 'grad_norm': '0.4093', 'learning_rate': '2.251e-05', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36405248, 'tokens/trainable': 36010712, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4444/5680 [11:05:41<2:42:32,  7.89s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4445/5680 [11:05:49<2:42:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4211', 'grad_norm': '0.3921', 'learning_rate': '2.247e-05', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36413440, 'tokens/trainable': 36018788, 'epoch': '6.085'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 4445/5680 [11:05:49<2:42:08,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4446/5680 [11:05:57<2:41:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3234', 'grad_norm': '0.369', 'learning_rate': '2.244e-05', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36421632, 'tokens/trainable': 36026916, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4446/5680 [11:05:57<2:41:54,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4447/5680 [11:06:05<2:41:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3158', 'grad_norm': '0.3397', 'learning_rate': '2.24e-05', 'ppl': '1.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36429824, 'tokens/trainable': 36035072, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4447/5680 [11:06:05<2:41:37,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4448/5680 [11:06:12<2:41:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4333', 'grad_norm': '0.4638', 'learning_rate': '2.237e-05', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36438016, 'tokens/trainable': 36043212, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4448/5680 [11:06:12<2:41:22,  7.86s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4449/5680 [11:06:20<2:41:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4127', 'grad_norm': '0.4387', 'learning_rate': '2.233e-05', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36446208, 'tokens/trainable': 36051388, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 4449/5680 [11:06:20<2:41:19,  7.86s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4450/5680 [11:06:28<2:41:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5918', 'grad_norm': '0.4125', 'learning_rate': '2.23e-05', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36454400, 'tokens/trainable': 36059492, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4450/5680 [11:06:28<2:41:00,  7.85s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4451/5680 [11:06:36<2:40:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6172', 'grad_norm': '0.4723', 'learning_rate': '2.226e-05', 'ppl': '1.854', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 36462592, 'tokens/trainable': 36067652, 'epoch': '6.086'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4451/5680 [11:06:36<2:40:43,  7.85s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4452/5680 [11:06:44<2:40:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4586', 'grad_norm': '0.3494', 'learning_rate': '2.223e-05', 'ppl': '1.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36470784, 'tokens/trainable': 36075828, 'epoch': '6.087'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4452/5680 [11:06:44<2:40:43,  7.85s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4453/5680 [11:06:52<2:40:34,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4988', 'grad_norm': '0.4405', 'learning_rate': '2.219e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 36478976, 'tokens/trainable': 36083992, 'epoch': '6.087'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 4453/5680 [11:06:52<2:40:34,  7.85s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4454/5680 [11:07:00<2:40:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5538', 'grad_norm': '0.4108', 'learning_rate': '2.216e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 36487168, 'tokens/trainable': 36092104, 'epoch': '6.087'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4454/5680 [11:07:00<2:40:33,  7.86s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4455/5680 [11:07:07<2:40:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3979', 'grad_norm': '0.4029', 'learning_rate': '2.212e-05', 'ppl': '1.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 36495360, 'tokens/trainable': 36100224, 'epoch': '6.087'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4455/5680 [11:07:07<2:40:36,  7.87s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4456/5680 [11:07:15<2:40:41,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3487', 'grad_norm': '0.38', 'learning_rate': '2.209e-05', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 36503552, 'tokens/trainable': 36108308, 'epoch': '6.087'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4456/5680 [11:07:15<2:40:41,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4457/5680 [11:07:23<2:40:32,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4279', 'grad_norm': '0.4243', 'learning_rate': '2.205e-05', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 36511744, 'tokens/trainable': 36116408, 'epoch': '6.088'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 4457/5680 [11:07:23<2:40:32,  7.88s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4458/5680 [11:07:31<2:40:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.479', 'grad_norm': '0.408', 'learning_rate': '2.202e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36519936, 'tokens/trainable': 36124536, 'epoch': '6.088'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4458/5680 [11:07:31<2:40:05,  7.86s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4459/5680 [11:07:39<2:39:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4176', 'grad_norm': '0.4014', 'learning_rate': '2.198e-05', 'ppl': '1.518', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 36528128, 'tokens/trainable': 36132712, 'epoch': '6.088'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4459/5680 [11:07:39<2:39:45,  7.85s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4460/5680 [11:07:47<2:39:34,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6677', 'grad_norm': '0.7399', 'learning_rate': '2.195e-05', 'ppl': '1.95', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36536320, 'tokens/trainable': 36140824, 'epoch': '6.088'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4460/5680 [11:07:47<2:39:34,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4461/5680 [11:07:55<2:39:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4406', 'grad_norm': '0.3793', 'learning_rate': '2.192e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 36544512, 'tokens/trainable': 36148940, 'epoch': '6.088'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4461/5680 [11:07:55<2:39:30,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4462/5680 [11:08:02<2:39:14,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3938', 'grad_norm': '0.4217', 'learning_rate': '2.188e-05', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36552704, 'tokens/trainable': 36157000, 'epoch': '6.088'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4462/5680 [11:08:02<2:39:14,  7.84s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4463/5680 [11:08:10<2:39:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5362', 'grad_norm': '0.4606', 'learning_rate': '2.185e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 36560896, 'tokens/trainable': 36165100, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4463/5680 [11:08:10<2:39:09,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4464/5680 [11:08:18<2:39:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4649', 'grad_norm': '0.4401', 'learning_rate': '2.181e-05', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36569088, 'tokens/trainable': 36173268, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 4464/5680 [11:08:18<2:39:18,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4465/5680 [11:08:26<2:39:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4337', 'grad_norm': '0.4246', 'learning_rate': '2.178e-05', 'ppl': '1.543', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36577280, 'tokens/trainable': 36181424, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4465/5680 [11:08:26<2:39:04,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4466/5680 [11:08:34<2:38:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5443', 'grad_norm': '0.4787', 'learning_rate': '2.174e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 36585472, 'tokens/trainable': 36189536, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4466/5680 [11:08:34<2:38:57,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4467/5680 [11:08:42<2:38:32,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.488', 'grad_norm': '0.4003', 'learning_rate': '2.171e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36593664, 'tokens/trainable': 36197648, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4467/5680 [11:08:42<2:38:32,  7.84s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4468/5680 [11:08:49<2:38:17,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6558', 'grad_norm': '0.4078', 'learning_rate': '2.167e-05', 'ppl': '1.927', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 36601856, 'tokens/trainable': 36205688, 'epoch': '6.089'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 4468/5680 [11:08:49<2:38:17,  7.84s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4469/5680 [11:08:57<2:38:13,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4168', 'grad_norm': '0.4008', 'learning_rate': '2.164e-05', 'ppl': '1.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36610048, 'tokens/trainable': 36213840, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4469/5680 [11:08:57<2:38:13,  7.84s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4470/5680 [11:09:05<2:38:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3711', 'grad_norm': '0.4427', 'learning_rate': '2.161e-05', 'ppl': '1.449', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 36618240, 'tokens/trainable': 36221896, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4470/5680 [11:09:05<2:38:25,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4471/5680 [11:09:13<2:38:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.37', 'grad_norm': '0.3605', 'learning_rate': '2.157e-05', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36626432, 'tokens/trainable': 36230064, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 4471/5680 [11:09:13<2:38:21,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4472/5680 [11:09:21<2:38:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5689', 'grad_norm': '0.385', 'learning_rate': '2.154e-05', 'ppl': '1.766', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 36634624, 'tokens/trainable': 36238216, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4472/5680 [11:09:21<2:38:11,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4473/5680 [11:09:29<2:37:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3277', 'grad_norm': '0.3508', 'learning_rate': '2.15e-05', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 36642816, 'tokens/trainable': 36246384, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4473/5680 [11:09:29<2:37:58,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4474/5680 [11:09:37<2:38:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5004', 'grad_norm': '0.4247', 'learning_rate': '2.147e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 36651008, 'tokens/trainable': 36254484, 'epoch': '6.09'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4474/5680 [11:09:37<2:38:02,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4475/5680 [11:09:45<2:37:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.432', 'grad_norm': '0.4801', 'learning_rate': '2.143e-05', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 36659200, 'tokens/trainable': 36262448, 'epoch': '6.091'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4475/5680 [11:09:45<2:37:47,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4476/5680 [11:09:52<2:37:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4738', 'grad_norm': '0.4363', 'learning_rate': '2.14e-05', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 36667392, 'tokens/trainable': 36270400, 'epoch': '6.091'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4476/5680 [11:09:52<2:37:42,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4477/5680 [11:10:00<2:37:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5433', 'grad_norm': '0.428', 'learning_rate': '2.137e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36675584, 'tokens/trainable': 36278504, 'epoch': '6.091'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4477/5680 [11:10:00<2:37:23,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4478/5680 [11:10:08<2:37:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5942', 'grad_norm': '0.6954', 'learning_rate': '2.133e-05', 'ppl': '1.812', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 36683776, 'tokens/trainable': 36286576, 'epoch': '6.091'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4478/5680 [11:10:08<2:37:20,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4479/5680 [11:10:16<2:37:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2402', 'grad_norm': '0.359', 'learning_rate': '2.13e-05', 'ppl': '1.271', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36691968, 'tokens/trainable': 36294716, 'epoch': '6.091'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 4479/5680 [11:10:16<2:37:31,  7.87s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4480/5680 [11:10:24<2:37:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4352', 'grad_norm': '0.4418', 'learning_rate': '2.126e-05', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 36700160, 'tokens/trainable': 36302772, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4480/5680 [11:10:24<2:37:19,  7.87s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4481/5680 [11:10:32<2:37:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2747', 'grad_norm': '0.411', 'learning_rate': '2.123e-05', 'ppl': '1.316', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36708352, 'tokens/trainable': 36310944, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4481/5680 [11:10:32<2:37:11,  7.87s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4482/5680 [11:10:40<2:36:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2341', 'grad_norm': '0.3923', 'learning_rate': '2.12e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 36716544, 'tokens/trainable': 36319024, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4482/5680 [11:10:40<2:36:54,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4483/5680 [11:10:47<2:36:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5094', 'grad_norm': '0.4489', 'learning_rate': '2.116e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 36724736, 'tokens/trainable': 36327056, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 4483/5680 [11:10:47<2:36:39,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4484/5680 [11:10:55<2:36:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.437', 'grad_norm': '0.3886', 'learning_rate': '2.113e-05', 'ppl': '1.548', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36732928, 'tokens/trainable': 36335196, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4484/5680 [11:10:55<2:36:23,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4485/5680 [11:11:03<2:36:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.236', 'grad_norm': '0.3728', 'learning_rate': '2.109e-05', 'ppl': '1.266', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 36741120, 'tokens/trainable': 36343184, 'epoch': '6.092'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4485/5680 [11:11:03<2:36:21,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4486/5680 [11:11:11<2:36:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4167', 'grad_norm': '0.376', 'learning_rate': '2.106e-05', 'ppl': '1.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 36749312, 'tokens/trainable': 36351216, 'epoch': '6.093'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 4486/5680 [11:11:11<2:36:21,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4487/5680 [11:11:19<2:36:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4069', 'grad_norm': '0.346', 'learning_rate': '2.103e-05', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 36757504, 'tokens/trainable': 36359264, 'epoch': '6.093'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4487/5680 [11:11:19<2:36:26,  7.87s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4488/5680 [11:11:27<2:36:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4935', 'grad_norm': '0.4544', 'learning_rate': '2.099e-05', 'ppl': '1.638', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 36765696, 'tokens/trainable': 36367216, 'epoch': '6.093'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4488/5680 [11:11:27<2:36:04,  7.86s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4489/5680 [11:11:35<2:35:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7925', 'grad_norm': '0.4323', 'learning_rate': '2.096e-05', 'ppl': '2.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36773888, 'tokens/trainable': 36375372, 'epoch': '6.093'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4489/5680 [11:11:35<2:35:51,  7.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4490/5680 [11:11:42<2:35:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5333', 'grad_norm': '0.3803', 'learning_rate': '2.092e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36782080, 'tokens/trainable': 36383504, 'epoch': '6.093'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 4490/5680 [11:11:42<2:35:50,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4491/5680 [11:11:50<2:35:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7859', 'grad_norm': '0.4312', 'learning_rate': '2.089e-05', 'ppl': '2.194', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36790272, 'tokens/trainable': 36391612, 'epoch': '6.093'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4491/5680 [11:11:50<2:35:38,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4492/5680 [11:11:58<2:35:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3236', 'grad_norm': '0.4094', 'learning_rate': '2.086e-05', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 36798464, 'tokens/trainable': 36399796, 'epoch': '6.094'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4492/5680 [11:11:58<2:35:38,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4493/5680 [11:12:06<2:35:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4976', 'grad_norm': '0.4604', 'learning_rate': '2.082e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 36806656, 'tokens/trainable': 36407828, 'epoch': '6.094'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4493/5680 [11:12:06<2:35:42,  7.87s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4494/5680 [11:12:14<2:35:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5548', 'grad_norm': '0.3702', 'learning_rate': '2.079e-05', 'ppl': '1.742', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 36814848, 'tokens/trainable': 36415956, 'epoch': '6.094'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4494/5680 [11:12:14<2:35:36,  7.87s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4495/5680 [11:12:22<2:35:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5307', 'grad_norm': '0.427', 'learning_rate': '2.075e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 36823040, 'tokens/trainable': 36424000, 'epoch': '6.094'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4495/5680 [11:12:22<2:35:29,  7.87s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4496/5680 [11:12:30<2:35:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3974', 'grad_norm': '0.3938', 'learning_rate': '2.072e-05', 'ppl': '1.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 36831232, 'tokens/trainable': 36431880, 'epoch': '6.094'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4496/5680 [11:12:30<2:35:10,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4497/5680 [11:12:37<2:34:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6595', 'grad_norm': '0.4031', 'learning_rate': '2.069e-05', 'ppl': '1.934', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 36839424, 'tokens/trainable': 36439896, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4497/5680 [11:12:37<2:34:48,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4498/5680 [11:12:45<2:34:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5571', 'grad_norm': '0.3873', 'learning_rate': '2.065e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 36847616, 'tokens/trainable': 36448032, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4498/5680 [11:12:45<2:34:44,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4499/5680 [11:12:53<2:34:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5364', 'grad_norm': '0.3853', 'learning_rate': '2.062e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 36855808, 'tokens/trainable': 36456204, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4499/5680 [11:12:53<2:34:35,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4500/5680 [11:13:01<2:34:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4455', 'grad_norm': '0.3832', 'learning_rate': '2.059e-05', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 36864000, 'tokens/trainable': 36464168, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4500/5680 [11:13:01<2:34:29,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4501/5680 [11:13:09<2:34:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6455', 'grad_norm': '0.4413', 'learning_rate': '2.055e-05', 'ppl': '1.907', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36872192, 'tokens/trainable': 36472256, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 4501/5680 [11:13:09<2:34:11,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4502/5680 [11:13:17<2:34:00,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3521', 'grad_norm': '0.404', 'learning_rate': '2.052e-05', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 36880384, 'tokens/trainable': 36480096, 'epoch': '6.095'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4502/5680 [11:13:17<2:34:00,  7.84s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4503/5680 [11:13:25<2:33:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4068', 'grad_norm': '0.4672', 'learning_rate': '2.049e-05', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 36888576, 'tokens/trainable': 36488208, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4503/5680 [11:13:25<2:33:57,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4504/5680 [11:13:32<2:34:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3022', 'grad_norm': '0.3981', 'learning_rate': '2.045e-05', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 36896768, 'tokens/trainable': 36496312, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4504/5680 [11:13:32<2:34:05,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4505/5680 [11:13:40<2:33:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4201', 'grad_norm': '0.3748', 'learning_rate': '2.042e-05', 'ppl': '1.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 36904960, 'tokens/trainable': 36504332, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 4505/5680 [11:13:40<2:33:54,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4506/5680 [11:13:48<2:33:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6031', 'grad_norm': '0.4254', 'learning_rate': '2.038e-05', 'ppl': '1.828', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 36913152, 'tokens/trainable': 36512260, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4506/5680 [11:13:48<2:33:40,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4507/5680 [11:13:56<2:33:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5041', 'grad_norm': '0.3724', 'learning_rate': '2.035e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.2', 'tokens/total': 36921344, 'tokens/trainable': 36520048, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4507/5680 [11:13:56<2:33:42,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4508/5680 [11:14:04<2:33:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2713', 'grad_norm': '0.3882', 'learning_rate': '2.032e-05', 'ppl': '1.312', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 36929536, 'tokens/trainable': 36527972, 'epoch': '6.096'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4508/5680 [11:14:04<2:33:46,  7.87s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4509/5680 [11:14:12<2:33:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4401', 'grad_norm': '0.4264', 'learning_rate': '2.028e-05', 'ppl': '1.553', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.1', 'tokens/total': 36937728, 'tokens/trainable': 36535832, 'epoch': '6.097'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 4509/5680 [11:14:12<2:33:48,  7.88s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4510/5680 [11:14:20<2:33:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.7385', 'grad_norm': '0.4274', 'learning_rate': '2.025e-05', 'ppl': '2.093', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 36945920, 'tokens/trainable': 36544016, 'epoch': '6.097'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4510/5680 [11:14:20<2:33:34,  7.88s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4511/5680 [11:14:27<2:33:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4579', 'grad_norm': '0.3831', 'learning_rate': '2.022e-05', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 36954112, 'tokens/trainable': 36551928, 'epoch': '6.097'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4511/5680 [11:14:27<2:33:07,  7.86s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4512/5680 [11:14:35<2:32:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4845', 'grad_norm': '0.4049', 'learning_rate': '2.018e-05', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 36962304, 'tokens/trainable': 36560056, 'epoch': '6.097'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 4512/5680 [11:14:35<2:32:43,  7.85s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4513/5680 [11:14:43<2:34:09,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3709', 'grad_norm': '0.3666', 'learning_rate': '2.015e-05', 'ppl': '1.449', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.7', 'tokens/total': 36970496, 'tokens/trainable': 36568132, 'epoch': '6.097'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4513/5680 [11:14:43<2:34:09,  7.93s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4514/5680 [11:14:51<2:33:46,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3493', 'grad_norm': '0.3171', 'learning_rate': '2.012e-05', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 36978688, 'tokens/trainable': 36576224, 'epoch': '6.098'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4514/5680 [11:14:51<2:33:46,  7.91s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4515/5680 [11:14:59<2:33:13,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5412', 'grad_norm': '0.38', 'learning_rate': '2.008e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 36986880, 'tokens/trainable': 36584304, 'epoch': '6.098'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4515/5680 [11:14:59<2:33:13,  7.89s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4516/5680 [11:15:07<2:32:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3819', 'grad_norm': '0.4444', 'learning_rate': '2.005e-05', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.7', 'tokens/total': 36995072, 'tokens/trainable': 36592092, 'epoch': '6.098'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 4516/5680 [11:15:07<2:32:44,  7.87s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4517/5680 [11:15:15<2:32:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2374', 'grad_norm': '0.3342', 'learning_rate': '2.002e-05', 'ppl': '1.268', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.9', 'tokens/total': 37003264, 'tokens/trainable': 36599984, 'epoch': '6.098'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4517/5680 [11:15:15<2:32:47,  7.88s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4518/5680 [11:15:23<2:32:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4903', 'grad_norm': '0.4308', 'learning_rate': '1.999e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.7', 'tokens/total': 37011456, 'tokens/trainable': 36607820, 'epoch': '6.098'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4518/5680 [11:15:23<2:32:27,  7.87s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4519/5680 [11:15:31<2:32:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3395', 'grad_norm': '0.3559', 'learning_rate': '1.995e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 37019648, 'tokens/trainable': 36615864, 'epoch': '6.098'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4519/5680 [11:15:31<2:32:13,  7.87s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4520/5680 [11:15:38<2:31:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4972', 'grad_norm': '0.4433', 'learning_rate': '1.992e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 37027840, 'tokens/trainable': 36623872, 'epoch': '6.099'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 4520/5680 [11:15:38<2:31:53,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4521/5680 [11:15:46<2:31:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.28', 'grad_norm': '0.459', 'learning_rate': '1.989e-05', 'ppl': '1.323', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 37036032, 'tokens/trainable': 36631836, 'epoch': '6.099'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4521/5680 [11:15:46<2:31:47,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4522/5680 [11:15:54<2:31:34,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3439', 'grad_norm': '0.3619', 'learning_rate': '1.985e-05', 'ppl': '1.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 37044224, 'tokens/trainable': 36639764, 'epoch': '6.099'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4522/5680 [11:15:54<2:31:34,  7.85s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4523/5680 [11:16:02<2:31:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4554', 'grad_norm': '0.4304', 'learning_rate': '1.982e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.9', 'tokens/total': 37052416, 'tokens/trainable': 36647528, 'epoch': '6.099'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 4523/5680 [11:16:02<2:31:37,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4524/5680 [11:16:10<2:31:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4479', 'grad_norm': '0.4327', 'learning_rate': '1.979e-05', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 37060608, 'tokens/trainable': 36655456, 'epoch': '6.099'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4524/5680 [11:16:10<2:31:34,  7.87s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4525/5680 [11:16:18<2:31:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6403', 'grad_norm': '0.4844', 'learning_rate': '1.975e-05', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 37068800, 'tokens/trainable': 36663472, 'epoch': '6.099'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4525/5680 [11:16:18<2:31:29,  7.87s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4526/5680 [11:16:26<2:31:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4635', 'grad_norm': '0.4648', 'learning_rate': '1.972e-05', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 37076992, 'tokens/trainable': 36671500, 'epoch': '6.1'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4526/5680 [11:16:26<2:31:11,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4527/5680 [11:16:33<2:31:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3886', 'grad_norm': '0.3658', 'learning_rate': '1.969e-05', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.2', 'tokens/total': 37085184, 'tokens/trainable': 36679376, 'epoch': '6.1'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 4527/5680 [11:16:33<2:31:16,  7.87s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4528/5680 [11:16:41<2:31:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4045', 'grad_norm': '0.3745', 'learning_rate': '1.965e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 37093376, 'tokens/trainable': 36687540, 'epoch': '6.1'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4528/5680 [11:16:41<2:31:06,  7.87s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4529/5680 [11:16:49<2:30:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6399', 'grad_norm': '0.4129', 'learning_rate': '1.962e-05', 'ppl': '1.896', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 37101568, 'tokens/trainable': 36695716, 'epoch': '6.1'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4529/5680 [11:16:49<2:30:49,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4530/5680 [11:16:57<2:30:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3698', 'grad_norm': '0.4567', 'learning_rate': '1.959e-05', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 37109760, 'tokens/trainable': 36703648, 'epoch': '6.1'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4530/5680 [11:16:57<2:30:41,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4531/5680 [11:17:05<2:30:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6205', 'grad_norm': '0.5281', 'learning_rate': '1.956e-05', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 37117952, 'tokens/trainable': 36711800, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 4531/5680 [11:17:05<2:30:29,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4532/5680 [11:17:13<2:30:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6129', 'grad_norm': '0.4839', 'learning_rate': '1.952e-05', 'ppl': '1.846', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 37126144, 'tokens/trainable': 36719880, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4532/5680 [11:17:13<2:30:21,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4533/5680 [11:17:21<2:30:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5188', 'grad_norm': '0.4302', 'learning_rate': '1.949e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973', 'tokens/total': 37134336, 'tokens/trainable': 36727520, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4533/5680 [11:17:21<2:30:12,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4534/5680 [11:17:28<2:30:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4205', 'grad_norm': '0.4382', 'learning_rate': '1.946e-05', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 37142528, 'tokens/trainable': 36735628, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4534/5680 [11:17:28<2:30:04,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4535/5680 [11:17:36<2:29:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4562', 'grad_norm': '0.4179', 'learning_rate': '1.942e-05', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 37150720, 'tokens/trainable': 36743464, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 4535/5680 [11:17:36<2:29:53,  7.85s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4536/5680 [11:17:44<2:29:34,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4805', 'grad_norm': '0.4241', 'learning_rate': '1.939e-05', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 37158912, 'tokens/trainable': 36751572, 'epoch': '6.101'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4536/5680 [11:17:44<2:29:34,  7.84s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4537/5680 [11:17:52<2:29:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.239', 'grad_norm': '0.3621', 'learning_rate': '1.936e-05', 'ppl': '1.27', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.5', 'tokens/total': 37167104, 'tokens/trainable': 36759372, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4537/5680 [11:17:52<2:29:45,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4538/5680 [11:18:00<2:29:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5081', 'grad_norm': '0.3897', 'learning_rate': '1.933e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 37175296, 'tokens/trainable': 36767256, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 4538/5680 [11:18:00<2:29:28,  7.85s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4539/5680 [11:18:08<2:29:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3473', 'grad_norm': '0.3671', 'learning_rate': '1.929e-05', 'ppl': '1.415', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 37183488, 'tokens/trainable': 36775300, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4539/5680 [11:18:08<2:29:20,  7.85s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4540/5680 [11:18:16<2:29:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.574', 'grad_norm': '0.4855', 'learning_rate': '1.926e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.8', 'tokens/total': 37191680, 'tokens/trainable': 36783120, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4540/5680 [11:18:16<2:29:12,  7.85s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4541/5680 [11:18:24<2:30:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5287', 'grad_norm': '0.4763', 'learning_rate': '1.923e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '946', 'tokens/total': 37199872, 'tokens/trainable': 36790832, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4541/5680 [11:18:24<2:30:47,  7.94s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4542/5680 [11:18:32<2:30:24,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.353', 'grad_norm': '0.3778', 'learning_rate': '1.92e-05', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '942.2', 'tokens/total': 37208064, 'tokens/trainable': 36798272, 'epoch': '6.102'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 4542/5680 [11:18:32<2:30:24,  7.93s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4543/5680 [11:18:39<2:29:33,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4031', 'grad_norm': '0.3846', 'learning_rate': '1.916e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 37216256, 'tokens/trainable': 36806280, 'epoch': '6.103'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4543/5680 [11:18:39<2:29:33,  7.89s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4544/5680 [11:18:47<2:29:16,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2275', 'grad_norm': '0.3493', 'learning_rate': '1.913e-05', 'ppl': '1.256', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.8', 'tokens/total': 37224448, 'tokens/trainable': 36814024, 'epoch': '6.103'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4544/5680 [11:18:47<2:29:16,  7.88s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4545/5680 [11:18:55<2:28:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4552', 'grad_norm': '0.4428', 'learning_rate': '1.91e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 37232640, 'tokens/trainable': 36821872, 'epoch': '6.103'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4545/5680 [11:18:55<2:28:50,  7.87s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4546/5680 [11:19:03<2:28:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4848', 'grad_norm': '0.4349', 'learning_rate': '1.907e-05', 'ppl': '1.624', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 37240832, 'tokens/trainable': 36829976, 'epoch': '6.103'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 4546/5680 [11:19:03<2:28:36,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4547/5680 [11:19:11<2:28:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3723', 'grad_norm': '0.4002', 'learning_rate': '1.903e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.8', 'tokens/total': 37249024, 'tokens/trainable': 36837624, 'epoch': '6.103'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4547/5680 [11:19:11<2:28:21,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4548/5680 [11:19:19<2:28:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2762', 'grad_norm': '0.4166', 'learning_rate': '1.9e-05', 'ppl': '1.318', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '971.8', 'tokens/total': 37257216, 'tokens/trainable': 36845252, 'epoch': '6.104'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4548/5680 [11:19:19<2:28:12,  7.86s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4549/5680 [11:19:27<2:28:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3808', 'grad_norm': '0.3546', 'learning_rate': '1.897e-05', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 37265408, 'tokens/trainable': 36853244, 'epoch': '6.104'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 4549/5680 [11:19:27<2:28:12,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4550/5680 [11:19:34<2:28:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4875', 'grad_norm': '0.416', 'learning_rate': '1.894e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '958.8', 'tokens/total': 37273600, 'tokens/trainable': 36860792, 'epoch': '6.104'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4550/5680 [11:19:34<2:28:08,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4551/5680 [11:19:42<2:28:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6452', 'grad_norm': '0.4361', 'learning_rate': '1.89e-05', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 37281792, 'tokens/trainable': 36868772, 'epoch': '6.104'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4551/5680 [11:19:42<2:28:05,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4552/5680 [11:19:50<2:27:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.582', 'grad_norm': '0.4157', 'learning_rate': '1.887e-05', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 37289984, 'tokens/trainable': 36876852, 'epoch': '6.104'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4552/5680 [11:19:50<2:27:48,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4553/5680 [11:19:58<2:28:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5722', 'grad_norm': '0.5796', 'learning_rate': '1.884e-05', 'ppl': '1.772', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '962.3', 'tokens/total': 37298176, 'tokens/trainable': 36884476, 'epoch': '6.104'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 4553/5680 [11:19:58<2:28:01,  7.88s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4554/5680 [11:20:06<2:27:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.592', 'grad_norm': '0.5555', 'learning_rate': '1.881e-05', 'ppl': '1.808', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.8', 'tokens/total': 37306368, 'tokens/trainable': 36892220, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4554/5680 [11:20:06<2:27:48,  7.88s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4555/5680 [11:20:14<2:27:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3451', 'grad_norm': '0.3918', 'learning_rate': '1.877e-05', 'ppl': '1.412', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 37314560, 'tokens/trainable': 36900260, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4555/5680 [11:20:14<2:27:47,  7.88s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4556/5680 [11:20:22<2:27:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4794', 'grad_norm': '0.3993', 'learning_rate': '1.874e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 37322752, 'tokens/trainable': 36908308, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4556/5680 [11:20:22<2:27:44,  7.89s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4557/5680 [11:20:30<2:27:33,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.437', 'grad_norm': '0.4148', 'learning_rate': '1.871e-05', 'ppl': '1.548', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 37330944, 'tokens/trainable': 36916176, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 4557/5680 [11:20:30<2:27:33,  7.88s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4558/5680 [11:20:37<2:27:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4129', 'grad_norm': '0.4042', 'learning_rate': '1.868e-05', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 37339136, 'tokens/trainable': 36924304, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4558/5680 [11:20:37<2:27:03,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4559/5680 [11:20:45<2:26:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4215', 'grad_norm': '0.3693', 'learning_rate': '1.865e-05', 'ppl': '1.524', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '981.6', 'tokens/total': 37347328, 'tokens/trainable': 36932008, 'epoch': '6.105'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4559/5680 [11:20:45<2:26:51,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4560/5680 [11:20:53<2:26:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4754', 'grad_norm': '0.4244', 'learning_rate': '1.861e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.3', 'tokens/total': 37355520, 'tokens/trainable': 36939772, 'epoch': '6.106'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4560/5680 [11:20:53<2:26:42,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4561/5680 [11:21:01<2:26:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4472', 'grad_norm': '0.3936', 'learning_rate': '1.858e-05', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 37363712, 'tokens/trainable': 36947952, 'epoch': '6.106'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 4561/5680 [11:21:01<2:26:34,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4562/5680 [11:21:09<2:26:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6211', 'grad_norm': '0.4249', 'learning_rate': '1.855e-05', 'ppl': '1.861', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 37371904, 'tokens/trainable': 36955896, 'epoch': '6.106'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4562/5680 [11:21:09<2:26:30,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4563/5680 [11:21:17<2:26:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6153', 'grad_norm': '0.4915', 'learning_rate': '1.852e-05', 'ppl': '1.85', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '968.4', 'tokens/total': 37380096, 'tokens/trainable': 36963504, 'epoch': '6.106'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4563/5680 [11:21:17<2:26:21,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4564/5680 [11:21:25<2:26:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.618', 'grad_norm': '0.4423', 'learning_rate': '1.849e-05', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.6', 'tokens/total': 37388288, 'tokens/trainable': 36970996, 'epoch': '6.106'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 4564/5680 [11:21:25<2:26:23,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4565/5680 [11:21:32<2:26:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5732', 'grad_norm': '0.4562', 'learning_rate': '1.845e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 37396480, 'tokens/trainable': 36978864, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4565/5680 [11:21:32<2:26:08,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4566/5680 [11:21:40<2:25:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3345', 'grad_norm': '0.4538', 'learning_rate': '1.842e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996.8', 'tokens/total': 37404672, 'tokens/trainable': 36986692, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4566/5680 [11:21:40<2:25:56,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4567/5680 [11:21:48<2:25:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3959', 'grad_norm': '0.4813', 'learning_rate': '1.839e-05', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 37412864, 'tokens/trainable': 36994664, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4567/5680 [11:21:48<2:25:50,  7.86s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4568/5680 [11:21:56<2:25:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3919', 'grad_norm': '0.3727', 'learning_rate': '1.836e-05', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 37421056, 'tokens/trainable': 37002736, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 4568/5680 [11:21:56<2:25:53,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4569/5680 [11:22:04<2:25:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4703', 'grad_norm': '0.4171', 'learning_rate': '1.833e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 37429248, 'tokens/trainable': 37010708, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4569/5680 [11:22:04<2:25:45,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4570/5680 [11:22:12<2:25:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3014', 'grad_norm': '0.3812', 'learning_rate': '1.829e-05', 'ppl': '1.352', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 37437440, 'tokens/trainable': 37018620, 'epoch': '6.107'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4570/5680 [11:22:12<2:25:34,  7.87s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4571/5680 [11:22:20<2:25:09,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3307', 'grad_norm': '0.3994', 'learning_rate': '1.826e-05', 'ppl': '1.392', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 37445632, 'tokens/trainable': 37026736, 'epoch': '6.108'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4571/5680 [11:22:20<2:25:09,  7.85s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4572/5680 [11:22:27<2:24:49,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4609', 'grad_norm': '0.415', 'learning_rate': '1.823e-05', 'ppl': '1.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 37453824, 'tokens/trainable': 37034620, 'epoch': '6.108'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 4572/5680 [11:22:27<2:24:49,  7.84s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4573/5680 [11:22:35<2:24:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6379', 'grad_norm': '0.4192', 'learning_rate': '1.82e-05', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 37462016, 'tokens/trainable': 37042744, 'epoch': '6.108'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4573/5680 [11:22:35<2:24:36,  7.84s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4574/5680 [11:22:43<2:24:20,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4628', 'grad_norm': '0.4253', 'learning_rate': '1.817e-05', 'ppl': '1.589', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 37470208, 'tokens/trainable': 37050700, 'epoch': '6.108'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4574/5680 [11:22:43<2:24:20,  7.83s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4575/5680 [11:22:51<2:24:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3842', 'grad_norm': '0.4318', 'learning_rate': '1.813e-05', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.5', 'tokens/total': 37478400, 'tokens/trainable': 37058360, 'epoch': '6.108'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 4575/5680 [11:22:51<2:24:29,  7.85s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4576/5680 [11:22:59<2:24:24,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4058', 'grad_norm': '0.3587', 'learning_rate': '1.81e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 37486592, 'tokens/trainable': 37066424, 'epoch': '6.108'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4576/5680 [11:22:59<2:24:24,  7.85s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4577/5680 [11:23:07<2:24:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3203', 'grad_norm': '0.3415', 'learning_rate': '1.807e-05', 'ppl': '1.377', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 37494784, 'tokens/trainable': 37074384, 'epoch': '6.109'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4577/5680 [11:23:07<2:24:27,  7.86s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4578/5680 [11:23:15<2:24:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2643', 'grad_norm': '0.403', 'learning_rate': '1.804e-05', 'ppl': '1.302', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 37502976, 'tokens/trainable': 37082544, 'epoch': '6.109'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4578/5680 [11:23:15<2:24:30,  7.87s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4579/5680 [11:23:22<2:24:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4419', 'grad_norm': '0.4564', 'learning_rate': '1.801e-05', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 37511168, 'tokens/trainable': 37090400, 'epoch': '6.109'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4579/5680 [11:23:22<2:24:25,  7.87s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4580/5680 [11:23:30<2:24:28,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5748', 'grad_norm': '0.4181', 'learning_rate': '1.798e-05', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 37519360, 'tokens/trainable': 37098560, 'epoch': '6.109'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4580/5680 [11:23:30<2:24:28,  7.88s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4581/5680 [11:23:38<2:24:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3945', 'grad_norm': '0.4459', 'learning_rate': '1.794e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 37527552, 'tokens/trainable': 37106624, 'epoch': '6.109'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4581/5680 [11:23:38<2:24:35,  7.89s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4582/5680 [11:23:46<2:24:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6239', 'grad_norm': '0.5726', 'learning_rate': '1.791e-05', 'ppl': '1.866', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '954.1', 'tokens/total': 37535744, 'tokens/trainable': 37114076, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4582/5680 [11:23:46<2:24:00,  7.87s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4583/5680 [11:23:54<2:23:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3933', 'grad_norm': '0.4023', 'learning_rate': '1.788e-05', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 37543936, 'tokens/trainable': 37121968, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 4583/5680 [11:23:54<2:23:59,  7.88s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4584/5680 [11:24:02<2:23:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3745', 'grad_norm': '0.4199', 'learning_rate': '1.785e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 37552128, 'tokens/trainable': 37129908, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4584/5680 [11:24:02<2:23:44,  7.87s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4585/5680 [11:24:10<2:23:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5686', 'grad_norm': '0.4608', 'learning_rate': '1.782e-05', 'ppl': '1.766', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 37560320, 'tokens/trainable': 37138020, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4585/5680 [11:24:10<2:23:21,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4586/5680 [11:24:18<2:23:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3026', 'grad_norm': '0.3638', 'learning_rate': '1.779e-05', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 37568512, 'tokens/trainable': 37145988, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4586/5680 [11:24:18<2:23:16,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4587/5680 [11:24:25<2:23:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.286', 'grad_norm': '0.3558', 'learning_rate': '1.775e-05', 'ppl': '1.331', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 37576704, 'tokens/trainable': 37154000, 'epoch': '6.11'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 4587/5680 [11:24:25<2:23:01,  7.85s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4588/5680 [11:24:33<2:23:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.1727', 'grad_norm': '0.334', 'learning_rate': '1.772e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '943.8', 'tokens/total': 37584896, 'tokens/trainable': 37161432, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4588/5680 [11:24:33<2:23:01,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4589/5680 [11:24:41<2:22:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5024', 'grad_norm': '0.4426', 'learning_rate': '1.769e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 37593088, 'tokens/trainable': 37169312, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4589/5680 [11:24:41<2:22:59,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4590/5680 [11:24:49<2:22:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4033', 'grad_norm': '0.4198', 'learning_rate': '1.766e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 37601280, 'tokens/trainable': 37177464, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 4590/5680 [11:24:49<2:22:36,  7.85s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4591/5680 [11:24:57<2:22:15,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6696', 'grad_norm': '0.5626', 'learning_rate': '1.763e-05', 'ppl': '1.953', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 37609472, 'tokens/trainable': 37185320, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4591/5680 [11:24:57<2:22:15,  7.84s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4592/5680 [11:25:05<2:22:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3487', 'grad_norm': '0.4113', 'learning_rate': '1.76e-05', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974', 'tokens/total': 37617664, 'tokens/trainable': 37192992, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4592/5680 [11:25:05<2:22:19,  7.85s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4593/5680 [11:25:13<2:22:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.373', 'grad_norm': '0.4296', 'learning_rate': '1.757e-05', 'ppl': '1.452', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 37625856, 'tokens/trainable': 37200932, 'epoch': '6.111'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4593/5680 [11:25:13<2:22:28,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4594/5680 [11:25:20<2:22:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8174', 'grad_norm': '0.4491', 'learning_rate': '1.754e-05', 'ppl': '2.265', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.9', 'tokens/total': 37634048, 'tokens/trainable': 37208664, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 4594/5680 [11:25:20<2:22:13,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4595/5680 [11:25:28<2:21:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4763', 'grad_norm': '0.4309', 'learning_rate': '1.75e-05', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 37642240, 'tokens/trainable': 37216568, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4595/5680 [11:25:28<2:21:59,  7.85s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4596/5680 [11:25:36<2:21:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5108', 'grad_norm': '0.4402', 'learning_rate': '1.747e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 37650432, 'tokens/trainable': 37224628, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4596/5680 [11:25:36<2:21:58,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4597/5680 [11:25:44<2:21:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7148', 'grad_norm': '0.5789', 'learning_rate': '1.744e-05', 'ppl': '2.044', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.6', 'tokens/total': 37658624, 'tokens/trainable': 37232392, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4597/5680 [11:25:44<2:21:38,  7.85s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4598/5680 [11:25:52<2:21:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2778', 'grad_norm': '0.3947', 'learning_rate': '1.741e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 37666816, 'tokens/trainable': 37240376, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 4598/5680 [11:25:52<2:21:49,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4599/5680 [11:26:00<2:21:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2965', 'grad_norm': '0.4099', 'learning_rate': '1.738e-05', 'ppl': '1.345', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 37675008, 'tokens/trainable': 37248520, 'epoch': '6.112'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4599/5680 [11:26:00<2:21:32,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4600/5680 [11:26:08<2:21:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6207', 'grad_norm': '0.4645', 'learning_rate': '1.735e-05', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 37683200, 'tokens/trainable': 37256384, 'epoch': '6.113'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4600/5680 [11:26:08<2:21:24,  7.86s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4601/5680 [11:26:15<2:21:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4054', 'grad_norm': '0.4531', 'learning_rate': '1.732e-05', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.2', 'tokens/total': 37691392, 'tokens/trainable': 37264004, 'epoch': '6.113'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 4601/5680 [11:26:15<2:21:32,  7.87s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4602/5680 [11:26:23<2:21:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4558', 'grad_norm': '0.3931', 'learning_rate': '1.729e-05', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995', 'tokens/total': 37699584, 'tokens/trainable': 37271824, 'epoch': '6.113'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4602/5680 [11:26:23<2:21:21,  7.87s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4603/5680 [11:26:31<2:21:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5499', 'grad_norm': '0.3917', 'learning_rate': '1.725e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 37707776, 'tokens/trainable': 37279944, 'epoch': '6.113'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4603/5680 [11:26:31<2:21:29,  7.88s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4604/5680 [11:26:39<2:21:46,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4289', 'grad_norm': '0.4362', 'learning_rate': '1.722e-05', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '966.9', 'tokens/total': 37715968, 'tokens/trainable': 37287640, 'epoch': '6.113'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4604/5680 [11:26:39<2:21:46,  7.91s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4605/5680 [11:26:47<2:21:44,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5072', 'grad_norm': '0.3952', 'learning_rate': '1.719e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.1', 'tokens/total': 37724160, 'tokens/trainable': 37295500, 'epoch': '6.114'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 4605/5680 [11:26:47<2:21:44,  7.91s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4606/5680 [11:26:55<2:21:44,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.1997', 'grad_norm': '0.3348', 'learning_rate': '1.716e-05', 'ppl': '1.221', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '973.5', 'tokens/total': 37732352, 'tokens/trainable': 37303224, 'epoch': '6.114'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4606/5680 [11:26:55<2:21:44,  7.92s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4607/5680 [11:27:03<2:21:29,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.406', 'grad_norm': '0.4429', 'learning_rate': '1.713e-05', 'ppl': '1.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.6', 'tokens/total': 37740544, 'tokens/trainable': 37310720, 'epoch': '6.114'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4607/5680 [11:27:03<2:21:29,  7.91s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4608/5680 [11:27:11<2:21:13,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5761', 'grad_norm': '0.4285', 'learning_rate': '1.71e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990', 'tokens/total': 37748736, 'tokens/trainable': 37318528, 'epoch': '6.114'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4608/5680 [11:27:11<2:21:13,  7.90s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4609/5680 [11:27:19<2:20:43,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3753', 'grad_norm': '0.3831', 'learning_rate': '1.707e-05', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 37756928, 'tokens/trainable': 37326436, 'epoch': '6.114'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 4609/5680 [11:27:19<2:20:43,  7.88s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4610/5680 [11:27:27<2:20:56,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3891', 'grad_norm': '0.3835', 'learning_rate': '1.704e-05', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.5', 'tokens/total': 37765120, 'tokens/trainable': 37334244, 'epoch': '6.114'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4610/5680 [11:27:27<2:20:56,  7.90s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4611/5680 [11:27:35<2:21:01,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5054', 'grad_norm': '0.4294', 'learning_rate': '1.701e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '994.3', 'tokens/total': 37773312, 'tokens/trainable': 37342140, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4611/5680 [11:27:35<2:21:01,  7.91s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4612/5680 [11:27:42<2:20:58,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.522', 'grad_norm': '0.4478', 'learning_rate': '1.698e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 37781504, 'tokens/trainable': 37350172, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4612/5680 [11:27:42<2:20:58,  7.92s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4613/5680 [11:27:51<2:22:01,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4917', 'grad_norm': '0.4342', 'learning_rate': '1.695e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978', 'tokens/total': 37789696, 'tokens/trainable': 37358132, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 4613/5680 [11:27:51<2:22:01,  7.99s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4614/5680 [11:27:58<2:21:14,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.5496', 'grad_norm': '0.4398', 'learning_rate': '1.691e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.4', 'tokens/total': 37797888, 'tokens/trainable': 37365972, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4614/5680 [11:27:58<2:21:14,  7.95s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4615/5680 [11:28:06<2:20:32,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5007', 'grad_norm': '0.4072', 'learning_rate': '1.688e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.5', 'tokens/total': 37806080, 'tokens/trainable': 37373748, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4615/5680 [11:28:06<2:20:32,  7.92s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4616/5680 [11:28:14<2:20:17,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3976', 'grad_norm': '0.4016', 'learning_rate': '1.685e-05', 'ppl': '1.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '975.3', 'tokens/total': 37814272, 'tokens/trainable': 37381448, 'epoch': '6.115'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 4616/5680 [11:28:14<2:20:17,  7.91s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4617/5680 [11:28:22<2:19:53,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.2762', 'grad_norm': '0.3854', 'learning_rate': '1.682e-05', 'ppl': '1.318', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999', 'tokens/total': 37822464, 'tokens/trainable': 37389300, 'epoch': '6.116'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4617/5680 [11:28:22<2:19:53,  7.90s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4618/5680 [11:28:30<2:19:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2028', 'grad_norm': '0.3536', 'learning_rate': '1.679e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 37830656, 'tokens/trainable': 37397436, 'epoch': '6.116'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4618/5680 [11:28:30<2:19:23,  7.87s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4619/5680 [11:28:38<2:19:34,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5302', 'grad_norm': '0.4237', 'learning_rate': '1.676e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.8', 'tokens/total': 37838848, 'tokens/trainable': 37405240, 'epoch': '6.116'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4619/5680 [11:28:38<2:19:34,  7.89s/it][2026-01-27 09:17:51,974] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:61967] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 09:17:53,194] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:61967] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:04<03:42, 25.02 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:36, 56.74 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:04<01:01, 87.13 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:05<00:40, 130.20 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:33, 153.54 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:26, 189.66 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:06<00:22, 218.38 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:19, 243.45 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:06<00:17, 264.49 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:07<00:16, 277.28 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:07<00:13, 322.75 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:07<00:13, 329.03 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:08<00:15, 285.74 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:14, 290.90 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:08<00:13, 301.87 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:09<00:13, 304.34 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:09<00:12, 307.31 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:12, 310.72 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:10<00:11, 315.48 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:10<00:11, 319.62 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:10<00:10, 320.28 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:11<00:10, 321.05 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:11<00:10, 320.43 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:11<00:09, 323.35 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:12<00:09, 327.19 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:12<00:08, 328.08 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:12<00:08, 330.37 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:13<00:08, 337.37 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:13<00:07, 332.16 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:13<00:07, 333.54 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:13<00:07, 332.16 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:14<00:07, 329.94 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:14<00:06, 320.20 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:14<00:06, 320.86 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:15<00:05, 367.61 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:15<00:06, 307.01 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:15<00:05, 356.10 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:16<00:05, 308.50 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:16<00:04, 317.26 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:16<00:04, 325.84 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:17<00:04, 321.27 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:17<00:03, 322.34 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:17<00:03, 333.38 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:18<00:03, 332.34 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:18<00:02, 329.09 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:18<00:02, 326.32 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:19<00:02, 325.50 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:19<00:01, 321.07 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:19<00:01, 328.12 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:20<00:01, 327.30 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:20<00:00, 332.55 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:20<00:00, 349.72 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:20<00:00, 349.88 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:21<00:00, 355.16 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:21<00:00, 260.12 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:04, 996.85 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1365.97 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1552.89 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:00, 1725.44 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1783.36 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1742.70 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1608.29 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1365.12 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2035.36 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2474.82 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2767.30 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2870.53 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2496.27 examples/s]
[2026-01-27 09:18:26,387] [WARNING] [py.warnings._showwarnmsg:109] [PID:61967] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4620/5680 [11:29:20<5:22:46, 18.27s/it]                                                                                                                                                                                                                                             {'loss': '0.6061', 'grad_norm': '0.4671', 'learning_rate': '1.673e-05', 'ppl': '1.833', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '168.8', 'tokens/total': 37847040, 'tokens/trainable': 37412360, 'epoch': '6.116'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 4620/5680 [11:29:20<5:22:46, 18.27s/it][2026-01-27 09:18:34,587] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:62206] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 09:18:35,791] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:62206] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:06<05:57, 15.60 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:07<02:40, 34.09 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:07<01:38, 54.59 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:08<01:06, 78.84 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:08<00:51, 100.83 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:09<00:41, 122.88 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:09<00:33, 146.47 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:10<00:30, 156.39 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:10<00:26, 179.45 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:11<00:23, 197.47 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:11<00:22, 197.64 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:12<00:21, 201.20 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:12<00:21, 200.78 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:13<00:20, 207.13 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:13<00:19, 215.49 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:14<00:17, 227.47 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:14<00:16, 231.56 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:14<00:17, 219.17 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:15<00:16, 226.29 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:15<00:16, 221.21 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:16<00:15, 224.73 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:16<00:14, 233.53 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:17<00:15, 209.64 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:17<00:14, 211.79 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:18<00:13, 219.24 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:18<00:12, 234.94 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:19<00:12, 229.53 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:19<00:12, 224.92 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:20<00:11, 219.69 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:20<00:10, 239.91 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:21<00:10, 221.17 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:21<00:10, 228.07 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:21<00:09, 223.81 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:22<00:09, 214.33 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:22<00:08, 223.04 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:23<00:08, 218.49 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:23<00:07, 233.43 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:24<00:07, 211.51 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:24<00:07, 219.46 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:25<00:06, 240.97 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:25<00:06, 218.49 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:26<00:05, 230.16 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:26<00:05, 220.20 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:27<00:04, 221.00 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:27<00:04, 217.83 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:28<00:03, 223.00 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:28<00:03, 242.20 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:29<00:02, 224.23 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:29<00:02, 218.28 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:30<00:02, 207.68 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:30<00:01, 230.28 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:30<00:00, 230.47 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:31<00:00, 209.58 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:31<00:00, 235.90 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:32<00:00, 172.83 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▋                                                                                                                                      | 1000/5677 [00:01<00:04, 980.18 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1340.06 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1495.14 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1656.12 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1763.21 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1788.83 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1604.24 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1374.47 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2044.10 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2408.53 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2692.18 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:02<00:00, 2775.96 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:02<00:00, 2439.45 examples/s]
[2026-01-27 09:19:14,812] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:62206] Using single process for pack_parallel, running sequentially.
[2026-01-27 09:19:20,187] [WARNING] [py.warnings._showwarnmsg:109] [PID:62206] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4621/5680 [11:30:14<8:30:45, 28.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4516', 'grad_norm': '0.382', 'learning_rate': '1.67e-05', 'ppl': '1.571', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 37855232, 'tokens/trainable': 37420536, 'epoch': '7'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4621/5680 [11:30:14<8:30:45, 28.94s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4622/5680 [11:30:22<6:39:04, 22.63s/it]                                                                                                                                                                                                                                             {'loss': '0.6012', 'grad_norm': '0.4064', 'learning_rate': '1.667e-05', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 37863424, 'tokens/trainable': 37428704, 'epoch': '7'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4622/5680 [11:30:22<6:39:04, 22.63s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4623/5680 [11:30:30<5:20:29, 18.19s/it]                                                                                                                                                                                                                                             {'loss': '0.5446', 'grad_norm': '0.4181', 'learning_rate': '1.664e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 37871616, 'tokens/trainable': 37436864, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4623/5680 [11:30:30<5:20:29, 18.19s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4624/5680 [11:30:38<4:25:27, 15.08s/it]                                                                                                                                                                                                                                             {'loss': '0.4209', 'grad_norm': '0.3774', 'learning_rate': '1.661e-05', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 37879808, 'tokens/trainable': 37444996, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 4624/5680 [11:30:38<4:25:27, 15.08s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4625/5680 [11:30:46<3:47:04, 12.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4671', 'grad_norm': '0.4511', 'learning_rate': '1.658e-05', 'ppl': '1.595', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 37888000, 'tokens/trainable': 37453148, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4625/5680 [11:30:46<3:47:04, 12.91s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4626/5680 [11:30:53<3:20:21, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.5449', 'grad_norm': '0.3824', 'learning_rate': '1.655e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 37896192, 'tokens/trainable': 37461300, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4626/5680 [11:30:53<3:20:21, 11.41s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4627/5680 [11:31:01<3:01:17, 10.33s/it]                                                                                                                                                                                                                                             {'loss': '0.3834', 'grad_norm': '0.3651', 'learning_rate': '1.652e-05', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 37904384, 'tokens/trainable': 37469476, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4627/5680 [11:31:01<3:01:17, 10.33s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4628/5680 [11:31:09<2:48:14,  9.60s/it]                                                                                                                                                                                                                                             {'loss': '0.5929', 'grad_norm': '0.4292', 'learning_rate': '1.649e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 37912576, 'tokens/trainable': 37477652, 'epoch': '7.001'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 4628/5680 [11:31:09<2:48:14,  9.60s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4629/5680 [11:31:17<2:38:58,  9.08s/it]                                                                                                                                                                                                                                             {'loss': '0.5855', 'grad_norm': '0.4173', 'learning_rate': '1.646e-05', 'ppl': '1.796', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 37920768, 'tokens/trainable': 37485840, 'epoch': '7.002'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4629/5680 [11:31:17<2:38:58,  9.08s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4630/5680 [11:31:25<2:32:20,  8.70s/it]                                                                                                                                                                                                                                             {'loss': '0.4751', 'grad_norm': '0.408', 'learning_rate': '1.643e-05', 'ppl': '1.608', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 37928960, 'tokens/trainable': 37494024, 'epoch': '7.002'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4630/5680 [11:31:25<2:32:20,  8.70s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4631/5680 [11:31:33<2:27:51,  8.46s/it]                                                                                                                                                                                                                                             {'loss': '0.439', 'grad_norm': '0.388', 'learning_rate': '1.639e-05', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 37937152, 'tokens/trainable': 37502196, 'epoch': '7.002'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 4631/5680 [11:31:33<2:27:51,  8.46s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4632/5680 [11:31:41<2:24:32,  8.28s/it]                                                                                                                                                                                                                                             {'loss': '0.3317', 'grad_norm': '0.3537', 'learning_rate': '1.636e-05', 'ppl': '1.393', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 37945344, 'tokens/trainable': 37510340, 'epoch': '7.002'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4632/5680 [11:31:41<2:24:32,  8.28s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4633/5680 [11:31:48<2:22:20,  8.16s/it]                                                                                                                                                                                                                                             {'loss': '0.4756', 'grad_norm': '0.3859', 'learning_rate': '1.633e-05', 'ppl': '1.609', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 37953536, 'tokens/trainable': 37518524, 'epoch': '7.002'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4633/5680 [11:31:48<2:22:20,  8.16s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4634/5680 [11:31:56<2:20:49,  8.08s/it]                                                                                                                                                                                                                                             {'loss': '0.5376', 'grad_norm': '0.4726', 'learning_rate': '1.63e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 37961728, 'tokens/trainable': 37526696, 'epoch': '7.002'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4634/5680 [11:31:56<2:20:49,  8.08s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4635/5680 [11:32:04<2:19:34,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.6242', 'grad_norm': '0.4434', 'learning_rate': '1.627e-05', 'ppl': '1.867', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 37969920, 'tokens/trainable': 37534860, 'epoch': '7.003'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4635/5680 [11:32:04<2:19:34,  8.01s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4636/5680 [11:32:12<2:18:57,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.4481', 'grad_norm': '0.3701', 'learning_rate': '1.624e-05', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 37978112, 'tokens/trainable': 37543048, 'epoch': '7.003'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4636/5680 [11:32:12<2:18:57,  7.99s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4637/5680 [11:32:20<2:18:12,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2768', 'grad_norm': '0.3989', 'learning_rate': '1.621e-05', 'ppl': '1.319', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 37986304, 'tokens/trainable': 37551208, 'epoch': '7.003'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4637/5680 [11:32:20<2:18:12,  7.95s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4638/5680 [11:32:28<2:17:54,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.5846', 'grad_norm': '0.4057', 'learning_rate': '1.618e-05', 'ppl': '1.794', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 37994496, 'tokens/trainable': 37559380, 'epoch': '7.003'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4638/5680 [11:32:28<2:17:54,  7.94s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4639/5680 [11:32:36<2:17:33,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.5629', 'grad_norm': '0.4271', 'learning_rate': '1.615e-05', 'ppl': '1.756', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38002688, 'tokens/trainable': 37567536, 'epoch': '7.003'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 4639/5680 [11:32:36<2:17:33,  7.93s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4640/5680 [11:32:44<2:17:14,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3366', 'grad_norm': '0.3845', 'learning_rate': '1.612e-05', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38010880, 'tokens/trainable': 37575688, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4640/5680 [11:32:44<2:17:14,  7.92s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4641/5680 [11:32:52<2:18:26,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.2639', 'grad_norm': '0.3808', 'learning_rate': '1.609e-05', 'ppl': '1.302', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 38019072, 'tokens/trainable': 37583852, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4641/5680 [11:32:52<2:18:26,  7.99s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4642/5680 [11:33:00<2:17:51,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.475', 'grad_norm': '0.3986', 'learning_rate': '1.606e-05', 'ppl': '1.608', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38027264, 'tokens/trainable': 37592036, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 4642/5680 [11:33:00<2:17:51,  7.97s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4643/5680 [11:33:08<2:17:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3898', 'grad_norm': '0.5342', 'learning_rate': '1.603e-05', 'ppl': '1.477', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38035456, 'tokens/trainable': 37600204, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4643/5680 [11:33:08<2:17:17,  7.94s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4644/5680 [11:33:16<2:16:43,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4385', 'grad_norm': '0.386', 'learning_rate': '1.6e-05', 'ppl': '1.55', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38043648, 'tokens/trainable': 37608360, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4644/5680 [11:33:16<2:16:43,  7.92s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4645/5680 [11:33:23<2:16:14,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6965', 'grad_norm': '0.4102', 'learning_rate': '1.597e-05', 'ppl': '2.007', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38051840, 'tokens/trainable': 37616504, 'epoch': '7.004'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4645/5680 [11:33:23<2:16:14,  7.90s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4646/5680 [11:33:31<2:15:54,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5729', 'grad_norm': '0.4797', 'learning_rate': '1.594e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 38060032, 'tokens/trainable': 37624692, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 4646/5680 [11:33:31<2:15:54,  7.89s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4647/5680 [11:33:39<2:15:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3355', 'grad_norm': '0.4042', 'learning_rate': '1.591e-05', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38068224, 'tokens/trainable': 37632864, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4647/5680 [11:33:39<2:15:40,  7.88s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4648/5680 [11:33:47<2:15:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4008', 'grad_norm': '0.3893', 'learning_rate': '1.588e-05', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 38076416, 'tokens/trainable': 37640996, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4648/5680 [11:33:47<2:15:38,  7.89s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4649/5680 [11:33:55<2:15:30,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4439', 'grad_norm': '0.3849', 'learning_rate': '1.585e-05', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38084608, 'tokens/trainable': 37649156, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4649/5680 [11:33:55<2:15:30,  7.89s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4650/5680 [11:34:03<2:15:32,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4278', 'grad_norm': '0.4573', 'learning_rate': '1.582e-05', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 38092800, 'tokens/trainable': 37657312, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 4650/5680 [11:34:03<2:15:32,  7.90s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4651/5680 [11:34:11<2:15:38,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.57', 'grad_norm': '0.4262', 'learning_rate': '1.579e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38100992, 'tokens/trainable': 37665492, 'epoch': '7.005'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4651/5680 [11:34:11<2:15:38,  7.91s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4652/5680 [11:34:19<2:15:31,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4788', 'grad_norm': '0.4165', 'learning_rate': '1.576e-05', 'ppl': '1.614', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 38109184, 'tokens/trainable': 37673600, 'epoch': '7.006'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4652/5680 [11:34:19<2:15:31,  7.91s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4653/5680 [11:34:27<2:15:30,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3536', 'grad_norm': '0.357', 'learning_rate': '1.573e-05', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 38117376, 'tokens/trainable': 37681784, 'epoch': '7.006'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4653/5680 [11:34:27<2:15:30,  7.92s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4654/5680 [11:34:35<2:15:21,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5643', 'grad_norm': '0.4547', 'learning_rate': '1.57e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 38125568, 'tokens/trainable': 37689888, 'epoch': '7.006'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 4654/5680 [11:34:35<2:15:21,  7.92s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4655/5680 [11:34:42<2:15:12,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4409', 'grad_norm': '0.3955', 'learning_rate': '1.567e-05', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38133760, 'tokens/trainable': 37698044, 'epoch': '7.006'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4655/5680 [11:34:42<2:15:12,  7.91s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4656/5680 [11:34:50<2:14:45,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3429', 'grad_norm': '0.3956', 'learning_rate': '1.564e-05', 'ppl': '1.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38141952, 'tokens/trainable': 37706196, 'epoch': '7.006'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4656/5680 [11:34:50<2:14:45,  7.90s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4657/5680 [11:34:58<2:14:39,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3668', 'grad_norm': '0.4084', 'learning_rate': '1.561e-05', 'ppl': '1.443', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38150144, 'tokens/trainable': 37714368, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 4657/5680 [11:34:58<2:14:39,  7.90s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4658/5680 [11:35:06<2:14:12,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6683', 'grad_norm': '0.5355', 'learning_rate': '1.558e-05', 'ppl': '1.951', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 38158336, 'tokens/trainable': 37722536, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4658/5680 [11:35:06<2:14:12,  7.88s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4659/5680 [11:35:14<2:14:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5057', 'grad_norm': '0.4444', 'learning_rate': '1.556e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 38166528, 'tokens/trainable': 37730640, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4659/5680 [11:35:14<2:14:00,  7.88s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4660/5680 [11:35:22<2:13:55,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5794', 'grad_norm': '0.6151', 'learning_rate': '1.553e-05', 'ppl': '1.785', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38174720, 'tokens/trainable': 37738808, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4660/5680 [11:35:22<2:13:55,  7.88s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4661/5680 [11:35:30<2:13:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6313', 'grad_norm': '0.4502', 'learning_rate': '1.55e-05', 'ppl': '1.88', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 38182912, 'tokens/trainable': 37746956, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4661/5680 [11:35:30<2:13:29,  7.86s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4662/5680 [11:35:37<2:13:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4243', 'grad_norm': '0.4008', 'learning_rate': '1.547e-05', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38191104, 'tokens/trainable': 37755064, 'epoch': '7.007'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4662/5680 [11:35:37<2:13:17,  7.86s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4663/5680 [11:35:45<2:13:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3319', 'grad_norm': '0.3959', 'learning_rate': '1.544e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38199296, 'tokens/trainable': 37763252, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4663/5680 [11:35:45<2:13:26,  7.87s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4664/5680 [11:35:53<2:13:32,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.816', 'grad_norm': '0.4521', 'learning_rate': '1.541e-05', 'ppl': '2.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38207488, 'tokens/trainable': 37771412, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4664/5680 [11:35:53<2:13:32,  7.89s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4665/5680 [11:36:01<2:13:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4088', 'grad_norm': '0.3573', 'learning_rate': '1.538e-05', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38215680, 'tokens/trainable': 37779600, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4665/5680 [11:36:01<2:13:21,  7.88s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4666/5680 [11:36:09<2:13:25,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.7408', 'grad_norm': '0.6214', 'learning_rate': '1.535e-05', 'ppl': '2.098', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 38223872, 'tokens/trainable': 37787708, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4666/5680 [11:36:09<2:13:25,  7.90s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4667/5680 [11:36:17<2:13:10,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5511', 'grad_norm': '0.4293', 'learning_rate': '1.532e-05', 'ppl': '1.735', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38232064, 'tokens/trainable': 37795860, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4667/5680 [11:36:17<2:13:10,  7.89s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4668/5680 [11:36:25<2:12:58,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3122', 'grad_norm': '0.3888', 'learning_rate': '1.529e-05', 'ppl': '1.366', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38240256, 'tokens/trainable': 37804032, 'epoch': '7.008'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 4668/5680 [11:36:25<2:12:58,  7.88s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4669/5680 [11:36:33<2:13:08,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.2844', 'grad_norm': '0.3965', 'learning_rate': '1.526e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 38248448, 'tokens/trainable': 37812196, 'epoch': '7.009'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4669/5680 [11:36:33<2:13:08,  7.90s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4670/5680 [11:36:41<2:12:52,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.333', 'grad_norm': '0.4156', 'learning_rate': '1.523e-05', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38256640, 'tokens/trainable': 37820384, 'epoch': '7.009'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4670/5680 [11:36:41<2:12:52,  7.89s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4671/5680 [11:36:48<2:12:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4236', 'grad_norm': '0.3786', 'learning_rate': '1.52e-05', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 38264832, 'tokens/trainable': 37828568, 'epoch': '7.009'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4671/5680 [11:36:48<2:12:34,  7.88s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4672/5680 [11:36:56<2:12:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6073', 'grad_norm': '0.4497', 'learning_rate': '1.517e-05', 'ppl': '1.836', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38273024, 'tokens/trainable': 37836684, 'epoch': '7.009'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4672/5680 [11:36:56<2:12:17,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4673/5680 [11:37:04<2:12:01,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5283', 'grad_norm': '0.4537', 'learning_rate': '1.514e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38281216, 'tokens/trainable': 37844816, 'epoch': '7.009'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4673/5680 [11:37:04<2:12:01,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4674/5680 [11:37:12<2:11:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3429', 'grad_norm': '0.4252', 'learning_rate': '1.511e-05', 'ppl': '1.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38289408, 'tokens/trainable': 37852992, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4674/5680 [11:37:12<2:11:55,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4675/5680 [11:37:20<2:11:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4578', 'grad_norm': '0.4322', 'learning_rate': '1.508e-05', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38297600, 'tokens/trainable': 37861164, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4675/5680 [11:37:20<2:11:48,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4676/5680 [11:37:28<2:11:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3357', 'grad_norm': '0.433', 'learning_rate': '1.506e-05', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 38305792, 'tokens/trainable': 37869292, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 4676/5680 [11:37:28<2:11:49,  7.88s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4677/5680 [11:37:36<2:11:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4146', 'grad_norm': '0.3894', 'learning_rate': '1.503e-05', 'ppl': '1.514', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 38313984, 'tokens/trainable': 37877464, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4677/5680 [11:37:36<2:11:26,  7.86s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4678/5680 [11:37:43<2:11:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4981', 'grad_norm': '0.4072', 'learning_rate': '1.5e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 38322176, 'tokens/trainable': 37885628, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4678/5680 [11:37:43<2:11:06,  7.85s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4679/5680 [11:37:51<2:11:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4004', 'grad_norm': '0.3994', 'learning_rate': '1.497e-05', 'ppl': '1.492', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38330368, 'tokens/trainable': 37893796, 'epoch': '7.01'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4679/5680 [11:37:51<2:11:08,  7.86s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4680/5680 [11:37:59<2:11:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5614', 'grad_norm': '0.4413', 'learning_rate': '1.494e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 38338560, 'tokens/trainable': 37901928, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 4680/5680 [11:37:59<2:11:12,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4681/5680 [11:38:07<2:11:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5863', 'grad_norm': '0.4002', 'learning_rate': '1.491e-05', 'ppl': '1.797', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38346752, 'tokens/trainable': 37910108, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4681/5680 [11:38:07<2:11:02,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4682/5680 [11:38:15<2:10:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.465', 'grad_norm': '0.4409', 'learning_rate': '1.488e-05', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38354944, 'tokens/trainable': 37918220, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4682/5680 [11:38:15<2:10:53,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4683/5680 [11:38:23<2:10:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6585', 'grad_norm': '0.5152', 'learning_rate': '1.485e-05', 'ppl': '1.932', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38363136, 'tokens/trainable': 37926356, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4683/5680 [11:38:23<2:10:43,  7.87s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4684/5680 [11:38:31<2:10:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4634', 'grad_norm': '0.4038', 'learning_rate': '1.482e-05', 'ppl': '1.589', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38371328, 'tokens/trainable': 37934524, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4684/5680 [11:38:31<2:10:31,  7.86s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4685/5680 [11:38:39<2:10:01,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6836', 'grad_norm': '0.4581', 'learning_rate': '1.479e-05', 'ppl': '1.981', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1051', 'tokens/total': 38379520, 'tokens/trainable': 37942704, 'epoch': '7.011'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4685/5680 [11:38:39<2:10:01,  7.84s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4686/5680 [11:38:46<2:09:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3858', 'grad_norm': '0.4485', 'learning_rate': '1.476e-05', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38387712, 'tokens/trainable': 37950856, 'epoch': '7.012'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4686/5680 [11:38:46<2:09:59,  7.85s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4687/5680 [11:38:54<2:09:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5289', 'grad_norm': '0.4585', 'learning_rate': '1.474e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38395904, 'tokens/trainable': 37959008, 'epoch': '7.012'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 4687/5680 [11:38:54<2:09:57,  7.85s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4688/5680 [11:39:02<2:10:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3972', 'grad_norm': '0.3993', 'learning_rate': '1.471e-05', 'ppl': '1.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38404096, 'tokens/trainable': 37967184, 'epoch': '7.012'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4688/5680 [11:39:02<2:10:08,  7.87s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4689/5680 [11:39:10<2:10:14,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3162', 'grad_norm': '0.5057', 'learning_rate': '1.468e-05', 'ppl': '1.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 38412288, 'tokens/trainable': 37975276, 'epoch': '7.012'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4689/5680 [11:39:10<2:10:14,  7.89s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4690/5680 [11:39:18<2:10:12,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3508', 'grad_norm': '0.3923', 'learning_rate': '1.465e-05', 'ppl': '1.42', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38420480, 'tokens/trainable': 37983424, 'epoch': '7.012'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4690/5680 [11:39:18<2:10:12,  7.89s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4691/5680 [11:39:26<2:09:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.45', 'grad_norm': '0.4073', 'learning_rate': '1.462e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 38428672, 'tokens/trainable': 37991600, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 4691/5680 [11:39:26<2:09:43,  7.87s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4692/5680 [11:39:34<2:09:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5673', 'grad_norm': '0.4712', 'learning_rate': '1.459e-05', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38436864, 'tokens/trainable': 37999748, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4692/5680 [11:39:34<2:09:30,  7.86s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4693/5680 [11:39:41<2:09:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5957', 'grad_norm': '0.4366', 'learning_rate': '1.456e-05', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38445056, 'tokens/trainable': 38007864, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4693/5680 [11:39:41<2:09:11,  7.85s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4694/5680 [11:39:49<2:09:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2454', 'grad_norm': '0.4276', 'learning_rate': '1.453e-05', 'ppl': '1.278', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38453248, 'tokens/trainable': 38016048, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 4694/5680 [11:39:49<2:09:12,  7.86s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4695/5680 [11:39:57<2:09:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3496', 'grad_norm': '0.4313', 'learning_rate': '1.451e-05', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38461440, 'tokens/trainable': 38024184, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4695/5680 [11:39:57<2:09:03,  7.86s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4696/5680 [11:40:05<2:08:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4022', 'grad_norm': '0.4174', 'learning_rate': '1.448e-05', 'ppl': '1.495', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 38469632, 'tokens/trainable': 38032368, 'epoch': '7.013'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4696/5680 [11:40:05<2:08:49,  7.86s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4697/5680 [11:40:13<2:08:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3077', 'grad_norm': '0.397', 'learning_rate': '1.445e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 38477824, 'tokens/trainable': 38040464, 'epoch': '7.014'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4697/5680 [11:40:13<2:08:48,  7.86s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4698/5680 [11:40:21<2:08:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7144', 'grad_norm': '0.6319', 'learning_rate': '1.442e-05', 'ppl': '2.043', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38486016, 'tokens/trainable': 38048592, 'epoch': '7.014'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 4698/5680 [11:40:21<2:08:40,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4699/5680 [11:40:29<2:08:45,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.212', 'grad_norm': '0.3027', 'learning_rate': '1.439e-05', 'ppl': '1.236', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38494208, 'tokens/trainable': 38056768, 'epoch': '7.014'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4699/5680 [11:40:29<2:08:45,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4700/5680 [11:40:37<2:08:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6374', 'grad_norm': '0.4123', 'learning_rate': '1.436e-05', 'ppl': '1.891', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38502400, 'tokens/trainable': 38064916, 'epoch': '7.014'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4700/5680 [11:40:37<2:08:30,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4701/5680 [11:40:44<2:08:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6329', 'grad_norm': '0.4424', 'learning_rate': '1.433e-05', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38510592, 'tokens/trainable': 38073060, 'epoch': '7.014'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4701/5680 [11:40:44<2:08:14,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4702/5680 [11:40:52<2:08:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3085', 'grad_norm': '0.3109', 'learning_rate': '1.431e-05', 'ppl': '1.361', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38518784, 'tokens/trainable': 38081248, 'epoch': '7.014'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 4702/5680 [11:40:52<2:08:11,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4703/5680 [11:41:00<2:08:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4628', 'grad_norm': '0.4032', 'learning_rate': '1.428e-05', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38526976, 'tokens/trainable': 38089416, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4703/5680 [11:41:00<2:08:00,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4704/5680 [11:41:08<2:08:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4114', 'grad_norm': '0.446', 'learning_rate': '1.425e-05', 'ppl': '1.509', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38535168, 'tokens/trainable': 38097596, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4704/5680 [11:41:08<2:08:08,  7.88s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4705/5680 [11:41:16<2:07:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3383', 'grad_norm': '0.3887', 'learning_rate': '1.422e-05', 'ppl': '1.403', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38543360, 'tokens/trainable': 38105752, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4705/5680 [11:41:16<2:07:52,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4706/5680 [11:41:24<2:07:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3746', 'grad_norm': '0.388', 'learning_rate': '1.419e-05', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 38551552, 'tokens/trainable': 38113920, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 4706/5680 [11:41:24<2:07:39,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4707/5680 [11:41:32<2:07:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4955', 'grad_norm': '0.4278', 'learning_rate': '1.416e-05', 'ppl': '1.641', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38559744, 'tokens/trainable': 38122100, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4707/5680 [11:41:32<2:07:32,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4708/5680 [11:41:39<2:07:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4375', 'grad_norm': '0.3903', 'learning_rate': '1.413e-05', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 38567936, 'tokens/trainable': 38130292, 'epoch': '7.015'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4708/5680 [11:41:39<2:07:21,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4709/5680 [11:41:47<2:07:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4155', 'grad_norm': '0.4345', 'learning_rate': '1.411e-05', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 38576128, 'tokens/trainable': 38138480, 'epoch': '7.016'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4709/5680 [11:41:47<2:07:09,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4710/5680 [11:41:55<2:07:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2864', 'grad_norm': '0.3975', 'learning_rate': '1.408e-05', 'ppl': '1.332', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 38584320, 'tokens/trainable': 38146604, 'epoch': '7.016'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4710/5680 [11:41:55<2:07:15,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4711/5680 [11:42:03<2:07:34,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6475', 'grad_norm': '0.4388', 'learning_rate': '1.405e-05', 'ppl': '1.911', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 38592512, 'tokens/trainable': 38154788, 'epoch': '7.016'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4711/5680 [11:42:03<2:07:34,  7.90s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4712/5680 [11:42:11<2:07:36,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3341', 'grad_norm': '0.3314', 'learning_rate': '1.402e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 38600704, 'tokens/trainable': 38162968, 'epoch': '7.016'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4712/5680 [11:42:11<2:07:36,  7.91s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4713/5680 [11:42:19<2:08:59,  8.00s/it]                                                                                                                                                                                                                                             {'loss': '0.3636', 'grad_norm': '0.3704', 'learning_rate': '1.399e-05', 'ppl': '1.439', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.1', 'tokens/total': 38608896, 'tokens/trainable': 38171096, 'epoch': '7.016'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4713/5680 [11:42:19<2:08:59,  8.00s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4714/5680 [11:42:27<2:08:18,  7.97s/it]                                                                                                                                                                                                                                             {'loss': '0.5665', 'grad_norm': '0.5167', 'learning_rate': '1.397e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38617088, 'tokens/trainable': 38179240, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4714/5680 [11:42:27<2:08:18,  7.97s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4715/5680 [11:42:35<2:07:49,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.3456', 'grad_norm': '0.3959', 'learning_rate': '1.394e-05', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 38625280, 'tokens/trainable': 38187308, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4715/5680 [11:42:35<2:07:49,  7.95s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4716/5680 [11:42:43<2:07:22,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.6342', 'grad_norm': '0.4716', 'learning_rate': '1.391e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38633472, 'tokens/trainable': 38195488, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4716/5680 [11:42:43<2:07:22,  7.93s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4717/5680 [11:42:51<2:06:52,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6673', 'grad_norm': '0.4566', 'learning_rate': '1.388e-05', 'ppl': '1.949', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38641664, 'tokens/trainable': 38203636, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 4717/5680 [11:42:51<2:06:52,  7.90s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4718/5680 [11:42:59<2:06:37,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3368', 'grad_norm': '0.4254', 'learning_rate': '1.385e-05', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38649856, 'tokens/trainable': 38211800, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4718/5680 [11:42:59<2:06:37,  7.90s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4719/5680 [11:43:07<2:06:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5835', 'grad_norm': '0.409', 'learning_rate': '1.382e-05', 'ppl': '1.792', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38658048, 'tokens/trainable': 38219960, 'epoch': '7.017'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4719/5680 [11:43:07<2:06:19,  7.89s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4720/5680 [11:43:14<2:06:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4463', 'grad_norm': '0.4788', 'learning_rate': '1.38e-05', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38666240, 'tokens/trainable': 38228128, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4720/5680 [11:43:14<2:06:05,  7.88s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4721/5680 [11:43:22<2:05:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4591', 'grad_norm': '0.5756', 'learning_rate': '1.377e-05', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 38674432, 'tokens/trainable': 38236304, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4721/5680 [11:43:22<2:05:50,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4722/5680 [11:43:30<2:05:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2721', 'grad_norm': '0.3793', 'learning_rate': '1.374e-05', 'ppl': '1.313', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 38682624, 'tokens/trainable': 38244432, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4722/5680 [11:43:30<2:05:43,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4723/5680 [11:43:38<2:05:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4019', 'grad_norm': '0.4289', 'learning_rate': '1.371e-05', 'ppl': '1.495', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38690816, 'tokens/trainable': 38252568, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4723/5680 [11:43:38<2:05:37,  7.88s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4724/5680 [11:43:46<2:05:32,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4466', 'grad_norm': '0.4757', 'learning_rate': '1.368e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38699008, 'tokens/trainable': 38260744, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 4724/5680 [11:43:46<2:05:32,  7.88s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4725/5680 [11:43:54<2:05:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.492', 'grad_norm': '0.4894', 'learning_rate': '1.366e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38707200, 'tokens/trainable': 38268896, 'epoch': '7.018'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4725/5680 [11:43:54<2:05:15,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4726/5680 [11:44:02<2:05:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6464', 'grad_norm': '0.4538', 'learning_rate': '1.363e-05', 'ppl': '1.909', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 38715392, 'tokens/trainable': 38277000, 'epoch': '7.019'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4726/5680 [11:44:02<2:05:06,  7.87s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4727/5680 [11:44:10<2:04:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4475', 'grad_norm': '0.4442', 'learning_rate': '1.36e-05', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 38723584, 'tokens/trainable': 38285152, 'epoch': '7.019'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4727/5680 [11:44:10<2:04:48,  7.86s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4728/5680 [11:44:17<2:04:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6179', 'grad_norm': '0.4469', 'learning_rate': '1.357e-05', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38731776, 'tokens/trainable': 38293328, 'epoch': '7.019'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 4728/5680 [11:44:17<2:04:50,  7.87s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4729/5680 [11:44:25<2:04:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3526', 'grad_norm': '0.346', 'learning_rate': '1.355e-05', 'ppl': '1.423', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38739968, 'tokens/trainable': 38301460, 'epoch': '7.019'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4729/5680 [11:44:25<2:04:34,  7.86s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4730/5680 [11:44:33<2:04:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5426', 'grad_norm': '0.4731', 'learning_rate': '1.352e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38748160, 'tokens/trainable': 38309576, 'epoch': '7.019'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4730/5680 [11:44:33<2:04:19,  7.85s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4731/5680 [11:44:41<2:04:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3614', 'grad_norm': '0.4205', 'learning_rate': '1.349e-05', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38756352, 'tokens/trainable': 38317740, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4731/5680 [11:44:41<2:04:25,  7.87s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4732/5680 [11:44:49<2:04:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2479', 'grad_norm': '0.373', 'learning_rate': '1.346e-05', 'ppl': '1.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38764544, 'tokens/trainable': 38325916, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 4732/5680 [11:44:49<2:04:23,  7.87s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4733/5680 [11:44:57<2:04:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5411', 'grad_norm': '0.4693', 'learning_rate': '1.343e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 38772736, 'tokens/trainable': 38334032, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4733/5680 [11:44:57<2:04:14,  7.87s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4734/5680 [11:45:05<2:04:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4344', 'grad_norm': '0.3643', 'learning_rate': '1.341e-05', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38780928, 'tokens/trainable': 38342212, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4734/5680 [11:45:05<2:04:05,  7.87s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4735/5680 [11:45:12<2:03:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.269', 'grad_norm': '0.432', 'learning_rate': '1.338e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 38789120, 'tokens/trainable': 38350380, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 4735/5680 [11:45:12<2:03:52,  7.86s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4736/5680 [11:45:20<2:03:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3256', 'grad_norm': '0.4148', 'learning_rate': '1.335e-05', 'ppl': '1.385', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38797312, 'tokens/trainable': 38358512, 'epoch': '7.02'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4736/5680 [11:45:20<2:03:41,  7.86s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4737/5680 [11:45:28<2:03:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3094', 'grad_norm': '0.3465', 'learning_rate': '1.332e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38805504, 'tokens/trainable': 38366672, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4737/5680 [11:45:28<2:03:33,  7.86s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4738/5680 [11:45:36<2:03:13,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2535', 'grad_norm': '0.4053', 'learning_rate': '1.33e-05', 'ppl': '1.288', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38813696, 'tokens/trainable': 38374804, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4738/5680 [11:45:36<2:03:13,  7.85s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4739/5680 [11:45:44<2:03:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2687', 'grad_norm': '0.4482', 'learning_rate': '1.327e-05', 'ppl': '1.308', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 38821888, 'tokens/trainable': 38382952, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4739/5680 [11:45:44<2:03:06,  7.85s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4740/5680 [11:45:52<2:03:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4668', 'grad_norm': '0.3795', 'learning_rate': '1.324e-05', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38830080, 'tokens/trainable': 38391112, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4740/5680 [11:45:52<2:03:03,  7.85s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4741/5680 [11:46:00<2:04:08,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3589', 'grad_norm': '0.3646', 'learning_rate': '1.321e-05', 'ppl': '1.432', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 38838272, 'tokens/trainable': 38399276, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4741/5680 [11:46:00<2:04:08,  7.93s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4742/5680 [11:46:08<2:03:40,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4735', 'grad_norm': '0.4232', 'learning_rate': '1.319e-05', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 38846464, 'tokens/trainable': 38407388, 'epoch': '7.021'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4742/5680 [11:46:08<2:03:40,  7.91s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4743/5680 [11:46:16<2:03:26,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.6272', 'grad_norm': '0.4178', 'learning_rate': '1.316e-05', 'ppl': '1.872', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38854656, 'tokens/trainable': 38415572, 'epoch': '7.022'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 4743/5680 [11:46:16<2:03:26,  7.90s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4744/5680 [11:46:23<2:03:11,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4985', 'grad_norm': '0.4006', 'learning_rate': '1.313e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 38862848, 'tokens/trainable': 38423700, 'epoch': '7.022'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4744/5680 [11:46:23<2:03:11,  7.90s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4745/5680 [11:46:31<2:02:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.514', 'grad_norm': '0.4587', 'learning_rate': '1.31e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 38871040, 'tokens/trainable': 38431868, 'epoch': '7.022'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4745/5680 [11:46:31<2:02:40,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4746/5680 [11:46:39<2:02:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4675', 'grad_norm': '0.393', 'learning_rate': '1.308e-05', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 38879232, 'tokens/trainable': 38440048, 'epoch': '7.022'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 4746/5680 [11:46:39<2:02:40,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4747/5680 [11:46:47<2:02:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5696', 'grad_norm': '0.5012', 'learning_rate': '1.305e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38887424, 'tokens/trainable': 38448176, 'epoch': '7.022'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4747/5680 [11:46:47<2:02:27,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4748/5680 [11:46:55<2:02:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4157', 'grad_norm': '0.3796', 'learning_rate': '1.302e-05', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38895616, 'tokens/trainable': 38456328, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4748/5680 [11:46:55<2:02:24,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4749/5680 [11:47:03<2:02:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4893', 'grad_norm': '0.4936', 'learning_rate': '1.299e-05', 'ppl': '1.631', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 38903808, 'tokens/trainable': 38464476, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4749/5680 [11:47:03<2:02:11,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4750/5680 [11:47:11<2:02:06,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3609', 'grad_norm': '0.4129', 'learning_rate': '1.297e-05', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38912000, 'tokens/trainable': 38472660, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4750/5680 [11:47:11<2:02:06,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4751/5680 [11:47:19<2:01:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5072', 'grad_norm': '0.4938', 'learning_rate': '1.294e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 38920192, 'tokens/trainable': 38480840, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4751/5680 [11:47:19<2:01:53,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4752/5680 [11:47:26<2:01:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3583', 'grad_norm': '0.4167', 'learning_rate': '1.291e-05', 'ppl': '1.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 38928384, 'tokens/trainable': 38489004, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4752/5680 [11:47:26<2:01:39,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4753/5680 [11:47:34<2:01:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.349', 'grad_norm': '0.4361', 'learning_rate': '1.289e-05', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 38936576, 'tokens/trainable': 38497184, 'epoch': '7.023'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4753/5680 [11:47:34<2:01:24,  7.86s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4754/5680 [11:47:42<2:01:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3676', 'grad_norm': '0.4309', 'learning_rate': '1.286e-05', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38944768, 'tokens/trainable': 38505340, 'epoch': '7.024'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4754/5680 [11:47:42<2:01:27,  7.87s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4755/5680 [11:47:50<2:01:27,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3324', 'grad_norm': '0.405', 'learning_rate': '1.283e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 38952960, 'tokens/trainable': 38513500, 'epoch': '7.024'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4755/5680 [11:47:50<2:01:27,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4756/5680 [11:47:58<2:01:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3392', 'grad_norm': '0.508', 'learning_rate': '1.28e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 38961152, 'tokens/trainable': 38521636, 'epoch': '7.024'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4756/5680 [11:47:58<2:01:18,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4757/5680 [11:48:06<2:01:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3032', 'grad_norm': '0.3657', 'learning_rate': '1.278e-05', 'ppl': '1.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 38969344, 'tokens/trainable': 38529812, 'epoch': '7.024'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4757/5680 [11:48:06<2:01:13,  7.88s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4758/5680 [11:48:14<2:01:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2698', 'grad_norm': '0.3633', 'learning_rate': '1.275e-05', 'ppl': '1.31', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 38977536, 'tokens/trainable': 38538000, 'epoch': '7.024'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 4758/5680 [11:48:14<2:01:08,  7.88s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4759/5680 [11:48:22<2:00:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.553', 'grad_norm': '0.4769', 'learning_rate': '1.272e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 38985728, 'tokens/trainable': 38546184, 'epoch': '7.024'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4759/5680 [11:48:22<2:00:53,  7.88s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4760/5680 [11:48:29<2:00:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3424', 'grad_norm': '0.3562', 'learning_rate': '1.27e-05', 'ppl': '1.408', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 38993920, 'tokens/trainable': 38554368, 'epoch': '7.025'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4760/5680 [11:48:29<2:00:30,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4761/5680 [11:48:37<2:00:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3098', 'grad_norm': '0.3763', 'learning_rate': '1.267e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39002112, 'tokens/trainable': 38562532, 'epoch': '7.025'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4761/5680 [11:48:37<2:00:23,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4762/5680 [11:48:45<2:00:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.308', 'grad_norm': '0.387', 'learning_rate': '1.264e-05', 'ppl': '1.361', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39010304, 'tokens/trainable': 38570664, 'epoch': '7.025'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4762/5680 [11:48:45<2:00:18,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4763/5680 [11:48:53<2:00:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3749', 'grad_norm': '0.3518', 'learning_rate': '1.262e-05', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39018496, 'tokens/trainable': 38578848, 'epoch': '7.025'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4763/5680 [11:48:53<2:00:19,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4764/5680 [11:49:01<2:00:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2551', 'grad_norm': '0.4231', 'learning_rate': '1.259e-05', 'ppl': '1.291', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39026688, 'tokens/trainable': 38587016, 'epoch': '7.025'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4764/5680 [11:49:01<2:00:13,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4765/5680 [11:49:09<1:59:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4468', 'grad_norm': '0.4084', 'learning_rate': '1.256e-05', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39034880, 'tokens/trainable': 38595164, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 4765/5680 [11:49:09<1:59:53,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4766/5680 [11:49:17<1:59:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4271', 'grad_norm': '0.457', 'learning_rate': '1.254e-05', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 39043072, 'tokens/trainable': 38603272, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4766/5680 [11:49:17<1:59:51,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4767/5680 [11:49:24<1:59:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3045', 'grad_norm': '0.3882', 'learning_rate': '1.251e-05', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39051264, 'tokens/trainable': 38611436, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4767/5680 [11:49:24<1:59:47,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4768/5680 [11:49:32<1:59:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4796', 'grad_norm': '0.4094', 'learning_rate': '1.248e-05', 'ppl': '1.615', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39059456, 'tokens/trainable': 38619620, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4768/5680 [11:49:32<1:59:39,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4769/5680 [11:49:40<1:59:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3602', 'grad_norm': '0.4603', 'learning_rate': '1.245e-05', 'ppl': '1.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39067648, 'tokens/trainable': 38627784, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 4769/5680 [11:49:40<1:59:27,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4770/5680 [11:49:48<1:59:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6933', 'grad_norm': '0.4633', 'learning_rate': '1.243e-05', 'ppl': '2', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39075840, 'tokens/trainable': 38635928, 'epoch': '7.026'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4770/5680 [11:49:48<1:59:15,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4771/5680 [11:49:56<1:59:05,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3913', 'grad_norm': '0.4328', 'learning_rate': '1.24e-05', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39084032, 'tokens/trainable': 38644092, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4771/5680 [11:49:56<1:59:05,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4772/5680 [11:50:04<1:59:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5796', 'grad_norm': '0.4136', 'learning_rate': '1.237e-05', 'ppl': '1.785', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 39092224, 'tokens/trainable': 38652212, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 4772/5680 [11:50:04<1:59:02,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4773/5680 [11:50:12<1:58:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4004', 'grad_norm': '0.4654', 'learning_rate': '1.235e-05', 'ppl': '1.492', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39100416, 'tokens/trainable': 38660352, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4773/5680 [11:50:12<1:58:52,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4774/5680 [11:50:19<1:58:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4597', 'grad_norm': '0.4205', 'learning_rate': '1.232e-05', 'ppl': '1.584', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39108608, 'tokens/trainable': 38668472, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4774/5680 [11:50:19<1:58:37,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4775/5680 [11:50:27<1:58:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4901', 'grad_norm': '0.427', 'learning_rate': '1.229e-05', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39116800, 'tokens/trainable': 38676624, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4775/5680 [11:50:27<1:58:36,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4776/5680 [11:50:35<1:58:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4732', 'grad_norm': '0.4505', 'learning_rate': '1.227e-05', 'ppl': '1.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39124992, 'tokens/trainable': 38684776, 'epoch': '7.027'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 4776/5680 [11:50:35<1:58:28,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4777/5680 [11:50:43<1:58:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2869', 'grad_norm': '0.4912', 'learning_rate': '1.224e-05', 'ppl': '1.332', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 39133184, 'tokens/trainable': 38692896, 'epoch': '7.028'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4777/5680 [11:50:43<1:58:35,  7.88s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4778/5680 [11:50:51<1:58:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3516', 'grad_norm': '0.348', 'learning_rate': '1.222e-05', 'ppl': '1.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39141376, 'tokens/trainable': 38701008, 'epoch': '7.028'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4778/5680 [11:50:51<1:58:19,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4779/5680 [11:50:59<1:58:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6979', 'grad_norm': '0.4274', 'learning_rate': '1.219e-05', 'ppl': '2.009', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39149568, 'tokens/trainable': 38709156, 'epoch': '7.028'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4779/5680 [11:50:59<1:58:04,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4780/5680 [11:51:07<1:57:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4785', 'grad_norm': '0.3904', 'learning_rate': '1.216e-05', 'ppl': '1.614', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39157760, 'tokens/trainable': 38717280, 'epoch': '7.028'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 4780/5680 [11:51:07<1:57:54,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4781/5680 [11:51:15<1:57:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.527', 'grad_norm': '0.4075', 'learning_rate': '1.214e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 39165952, 'tokens/trainable': 38725464, 'epoch': '7.028'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4781/5680 [11:51:15<1:57:44,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4782/5680 [11:51:22<1:57:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4499', 'grad_norm': '0.4136', 'learning_rate': '1.211e-05', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39174144, 'tokens/trainable': 38733612, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4782/5680 [11:51:22<1:57:41,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4783/5680 [11:51:30<1:57:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4462', 'grad_norm': '0.5137', 'learning_rate': '1.208e-05', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39182336, 'tokens/trainable': 38741772, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4783/5680 [11:51:30<1:57:37,  7.87s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4784/5680 [11:51:38<1:57:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5506', 'grad_norm': '0.4294', 'learning_rate': '1.206e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39190528, 'tokens/trainable': 38749888, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 4784/5680 [11:51:38<1:57:19,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4785/5680 [11:51:46<1:57:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4974', 'grad_norm': '0.4526', 'learning_rate': '1.203e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39198720, 'tokens/trainable': 38758028, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4785/5680 [11:51:46<1:57:10,  7.86s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4786/5680 [11:51:54<1:56:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4484', 'grad_norm': '0.4347', 'learning_rate': '1.2e-05', 'ppl': '1.566', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39206912, 'tokens/trainable': 38766176, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4786/5680 [11:51:54<1:56:59,  7.85s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4787/5680 [11:52:02<1:56:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.633', 'grad_norm': '0.4477', 'learning_rate': '1.198e-05', 'ppl': '1.883', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39215104, 'tokens/trainable': 38774304, 'epoch': '7.029'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 4787/5680 [11:52:02<1:56:55,  7.86s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4788/5680 [11:52:10<1:56:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4576', 'grad_norm': '0.3839', 'learning_rate': '1.195e-05', 'ppl': '1.58', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39223296, 'tokens/trainable': 38782460, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4788/5680 [11:52:10<1:56:57,  7.87s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4789/5680 [11:52:17<1:56:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4737', 'grad_norm': '0.4407', 'learning_rate': '1.193e-05', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 39231488, 'tokens/trainable': 38790636, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4789/5680 [11:52:17<1:56:45,  7.86s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4790/5680 [11:52:25<1:56:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2834', 'grad_norm': '0.3638', 'learning_rate': '1.19e-05', 'ppl': '1.328', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39239680, 'tokens/trainable': 38798816, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4790/5680 [11:52:25<1:56:42,  7.87s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4791/5680 [11:52:33<1:56:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.512', 'grad_norm': '0.4598', 'learning_rate': '1.187e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39247872, 'tokens/trainable': 38806976, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4791/5680 [11:52:33<1:56:28,  7.86s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4792/5680 [11:52:41<1:56:11,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4444', 'grad_norm': '0.425', 'learning_rate': '1.185e-05', 'ppl': '1.56', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 39256064, 'tokens/trainable': 38815152, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4792/5680 [11:52:41<1:56:11,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4793/5680 [11:52:49<1:56:05,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3393', 'grad_norm': '0.4269', 'learning_rate': '1.182e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39264256, 'tokens/trainable': 38823272, 'epoch': '7.03'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4793/5680 [11:52:49<1:56:05,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4794/5680 [11:52:57<1:55:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3522', 'grad_norm': '0.3914', 'learning_rate': '1.179e-05', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39272448, 'tokens/trainable': 38831424, 'epoch': '7.031'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4794/5680 [11:52:57<1:55:58,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4795/5680 [11:53:05<1:55:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4048', 'grad_norm': '0.438', 'learning_rate': '1.177e-05', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 39280640, 'tokens/trainable': 38839508, 'epoch': '7.031'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 4795/5680 [11:53:05<1:55:51,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4796/5680 [11:53:12<1:55:41,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4036', 'grad_norm': '0.4355', 'learning_rate': '1.174e-05', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 39288832, 'tokens/trainable': 38847692, 'epoch': '7.031'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4796/5680 [11:53:12<1:55:41,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4797/5680 [11:53:20<1:55:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5172', 'grad_norm': '0.3958', 'learning_rate': '1.172e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39297024, 'tokens/trainable': 38855840, 'epoch': '7.031'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4797/5680 [11:53:20<1:55:32,  7.85s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4798/5680 [11:53:28<1:55:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4566', 'grad_norm': '0.4323', 'learning_rate': '1.169e-05', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 39305216, 'tokens/trainable': 38863960, 'epoch': '7.031'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4798/5680 [11:53:28<1:55:38,  7.87s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4799/5680 [11:53:36<1:55:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4534', 'grad_norm': '0.4327', 'learning_rate': '1.166e-05', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39313408, 'tokens/trainable': 38872132, 'epoch': '7.032'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 4799/5680 [11:53:36<1:55:30,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4800/5680 [11:53:44<1:55:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2902', 'grad_norm': '0.4064', 'learning_rate': '1.164e-05', 'ppl': '1.337', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39321600, 'tokens/trainable': 38880288, 'epoch': '7.032'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4800/5680 [11:53:44<1:55:24,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4801/5680 [11:53:52<1:55:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3403', 'grad_norm': '0.3975', 'learning_rate': '1.161e-05', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 39329792, 'tokens/trainable': 38888376, 'epoch': '7.032'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4801/5680 [11:53:52<1:55:13,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4802/5680 [11:54:00<1:54:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.539', 'grad_norm': '0.4564', 'learning_rate': '1.159e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 39337984, 'tokens/trainable': 38896528, 'epoch': '7.032'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 4802/5680 [11:54:00<1:54:53,  7.85s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4803/5680 [11:54:07<1:54:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2473', 'grad_norm': '0.3346', 'learning_rate': '1.156e-05', 'ppl': '1.281', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39346176, 'tokens/trainable': 38904696, 'epoch': '7.032'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4803/5680 [11:54:07<1:54:51,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4804/5680 [11:54:15<1:54:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3582', 'grad_norm': '0.3844', 'learning_rate': '1.154e-05', 'ppl': '1.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39354368, 'tokens/trainable': 38912876, 'epoch': '7.032'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4804/5680 [11:54:15<1:54:44,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4805/5680 [11:54:23<1:54:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5224', 'grad_norm': '0.4459', 'learning_rate': '1.151e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39362560, 'tokens/trainable': 38921044, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4805/5680 [11:54:23<1:54:39,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4806/5680 [11:54:31<1:54:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5728', 'grad_norm': '0.4098', 'learning_rate': '1.148e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 39370752, 'tokens/trainable': 38929228, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 4806/5680 [11:54:31<1:54:26,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4807/5680 [11:54:39<1:54:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5455', 'grad_norm': '0.4069', 'learning_rate': '1.146e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39378944, 'tokens/trainable': 38937372, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4807/5680 [11:54:39<1:54:23,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4808/5680 [11:54:47<1:54:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5191', 'grad_norm': '0.4473', 'learning_rate': '1.143e-05', 'ppl': '1.681', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39387136, 'tokens/trainable': 38945532, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4808/5680 [11:54:47<1:54:18,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4809/5680 [11:54:55<1:54:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4876', 'grad_norm': '0.3972', 'learning_rate': '1.141e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39395328, 'tokens/trainable': 38953712, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4809/5680 [11:54:55<1:54:08,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4810/5680 [11:55:02<1:54:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3723', 'grad_norm': '0.4199', 'learning_rate': '1.138e-05', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 39403520, 'tokens/trainable': 38961832, 'epoch': '7.033'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 4810/5680 [11:55:02<1:54:02,  7.86s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4811/5680 [11:55:10<1:53:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5232', 'grad_norm': '0.4213', 'learning_rate': '1.136e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39411712, 'tokens/trainable': 38970012, 'epoch': '7.034'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4811/5680 [11:55:10<1:53:56,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4812/5680 [11:55:18<1:53:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3279', 'grad_norm': '0.3803', 'learning_rate': '1.133e-05', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39419904, 'tokens/trainable': 38978144, 'epoch': '7.034'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4812/5680 [11:55:18<1:53:48,  7.87s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4813/5680 [11:55:26<1:54:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3648', 'grad_norm': '0.3565', 'learning_rate': '1.13e-05', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 39428096, 'tokens/trainable': 38986280, 'epoch': '7.034'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 4813/5680 [11:55:26<1:54:47,  7.94s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4814/5680 [11:55:34<1:54:29,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.395', 'grad_norm': '0.4086', 'learning_rate': '1.128e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39436288, 'tokens/trainable': 38994464, 'epoch': '7.034'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4814/5680 [11:55:34<1:54:29,  7.93s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4815/5680 [11:55:42<1:54:00,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.2412', 'grad_norm': '0.3806', 'learning_rate': '1.125e-05', 'ppl': '1.273', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39444480, 'tokens/trainable': 39002636, 'epoch': '7.034'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4815/5680 [11:55:42<1:54:00,  7.91s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4816/5680 [11:55:50<1:53:41,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5115', 'grad_norm': '0.3753', 'learning_rate': '1.123e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 39452672, 'tokens/trainable': 39010752, 'epoch': '7.035'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4816/5680 [11:55:50<1:53:41,  7.90s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4817/5680 [11:55:58<1:53:19,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4922', 'grad_norm': '0.5502', 'learning_rate': '1.12e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39460864, 'tokens/trainable': 39018872, 'epoch': '7.035'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 4817/5680 [11:55:58<1:53:19,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4818/5680 [11:56:06<1:53:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3429', 'grad_norm': '0.4311', 'learning_rate': '1.118e-05', 'ppl': '1.409', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39469056, 'tokens/trainable': 39027032, 'epoch': '7.035'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4818/5680 [11:56:06<1:53:15,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4819/5680 [11:56:14<1:52:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3947', 'grad_norm': '0.4911', 'learning_rate': '1.115e-05', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39477248, 'tokens/trainable': 39035136, 'epoch': '7.035'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4819/5680 [11:56:14<1:52:54,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4820/5680 [11:56:21<1:52:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4655', 'grad_norm': '0.4513', 'learning_rate': '1.113e-05', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 39485440, 'tokens/trainable': 39043204, 'epoch': '7.035'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4820/5680 [11:56:21<1:52:44,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4821/5680 [11:56:29<1:52:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5115', 'grad_norm': '0.432', 'learning_rate': '1.11e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39493632, 'tokens/trainable': 39051332, 'epoch': '7.035'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 4821/5680 [11:56:29<1:52:27,  7.85s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4822/5680 [11:56:37<1:52:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5559', 'grad_norm': '0.4548', 'learning_rate': '1.108e-05', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39501824, 'tokens/trainable': 39059464, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4822/5680 [11:56:37<1:52:19,  7.86s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4823/5680 [11:56:45<1:51:58,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4457', 'grad_norm': '0.4149', 'learning_rate': '1.105e-05', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 39510016, 'tokens/trainable': 39067608, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4823/5680 [11:56:45<1:51:58,  7.84s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4824/5680 [11:56:53<1:51:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4242', 'grad_norm': '0.4058', 'learning_rate': '1.103e-05', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39518208, 'tokens/trainable': 39075784, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4824/5680 [11:56:53<1:51:58,  7.85s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4825/5680 [11:57:01<1:51:38,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.389', 'grad_norm': '0.3847', 'learning_rate': '1.1e-05', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39526400, 'tokens/trainable': 39083904, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4825/5680 [11:57:01<1:51:38,  7.83s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4826/5680 [11:57:08<1:51:30,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5479', 'grad_norm': '0.4247', 'learning_rate': '1.097e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 39534592, 'tokens/trainable': 39092064, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4826/5680 [11:57:08<1:51:30,  7.83s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4827/5680 [11:57:16<1:51:23,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3129', 'grad_norm': '0.4007', 'learning_rate': '1.095e-05', 'ppl': '1.367', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39542784, 'tokens/trainable': 39100224, 'epoch': '7.036'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4827/5680 [11:57:16<1:51:23,  7.84s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4828/5680 [11:57:24<1:51:20,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5377', 'grad_norm': '0.5525', 'learning_rate': '1.092e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 39550976, 'tokens/trainable': 39108404, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 4828/5680 [11:57:24<1:51:20,  7.84s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4829/5680 [11:57:32<1:51:19,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4476', 'grad_norm': '0.3829', 'learning_rate': '1.09e-05', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39559168, 'tokens/trainable': 39116548, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4829/5680 [11:57:32<1:51:19,  7.85s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4830/5680 [11:57:40<1:51:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.618', 'grad_norm': '0.4063', 'learning_rate': '1.087e-05', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 39567360, 'tokens/trainable': 39124712, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4830/5680 [11:57:40<1:51:16,  7.85s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4831/5680 [11:57:48<1:51:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3405', 'grad_norm': '0.4147', 'learning_rate': '1.085e-05', 'ppl': '1.406', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39575552, 'tokens/trainable': 39132872, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4831/5680 [11:57:48<1:51:20,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4832/5680 [11:57:56<1:51:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3469', 'grad_norm': '0.4421', 'learning_rate': '1.082e-05', 'ppl': '1.415', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39583744, 'tokens/trainable': 39141060, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 4832/5680 [11:57:56<1:51:15,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4833/5680 [11:58:03<1:51:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5936', 'grad_norm': '0.4674', 'learning_rate': '1.08e-05', 'ppl': '1.811', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39591936, 'tokens/trainable': 39149216, 'epoch': '7.037'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4833/5680 [11:58:03<1:51:11,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4834/5680 [11:58:11<1:50:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4676', 'grad_norm': '0.4067', 'learning_rate': '1.077e-05', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39600128, 'tokens/trainable': 39157340, 'epoch': '7.038'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4834/5680 [11:58:11<1:50:54,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4835/5680 [11:58:19<1:50:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3017', 'grad_norm': '0.3853', 'learning_rate': '1.075e-05', 'ppl': '1.352', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39608320, 'tokens/trainable': 39165472, 'epoch': '7.038'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4835/5680 [11:58:19<1:50:44,  7.86s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4836/5680 [11:58:27<1:50:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3191', 'grad_norm': '0.3492', 'learning_rate': '1.072e-05', 'ppl': '1.376', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 39616512, 'tokens/trainable': 39173576, 'epoch': '7.038'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4836/5680 [11:58:27<1:50:39,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4837/5680 [11:58:35<1:50:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3343', 'grad_norm': '0.4386', 'learning_rate': '1.07e-05', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39624704, 'tokens/trainable': 39181724, 'epoch': '7.038'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4837/5680 [11:58:35<1:50:21,  7.86s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4838/5680 [11:58:43<1:50:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7774', 'grad_norm': '0.4442', 'learning_rate': '1.067e-05', 'ppl': '2.176', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 39632896, 'tokens/trainable': 39189816, 'epoch': '7.038'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4838/5680 [11:58:43<1:50:19,  7.86s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4839/5680 [11:58:51<1:50:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3321', 'grad_norm': '0.372', 'learning_rate': '1.065e-05', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 39641088, 'tokens/trainable': 39197936, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 4839/5680 [11:58:51<1:50:24,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4840/5680 [11:58:59<1:50:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4752', 'grad_norm': '0.4477', 'learning_rate': '1.062e-05', 'ppl': '1.608', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39649280, 'tokens/trainable': 39206080, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4840/5680 [11:58:59<1:50:13,  7.87s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4841/5680 [11:59:07<1:51:11,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.6916', 'grad_norm': '0.4945', 'learning_rate': '1.06e-05', 'ppl': '1.997', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 39657472, 'tokens/trainable': 39214232, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4841/5680 [11:59:07<1:51:11,  7.95s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4842/5680 [11:59:15<1:50:39,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3263', 'grad_norm': '0.3549', 'learning_rate': '1.058e-05', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39665664, 'tokens/trainable': 39222380, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4842/5680 [11:59:15<1:50:39,  7.92s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4843/5680 [11:59:22<1:50:21,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3151', 'grad_norm': '0.3754', 'learning_rate': '1.055e-05', 'ppl': '1.37', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39673856, 'tokens/trainable': 39230536, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4843/5680 [11:59:22<1:50:21,  7.91s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4844/5680 [11:59:30<1:49:55,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5799', 'grad_norm': '0.4213', 'learning_rate': '1.053e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 39682048, 'tokens/trainable': 39238708, 'epoch': '7.039'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4844/5680 [11:59:30<1:49:55,  7.89s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4845/5680 [11:59:38<1:49:36,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3307', 'grad_norm': '0.4405', 'learning_rate': '1.05e-05', 'ppl': '1.392', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39690240, 'tokens/trainable': 39246812, 'epoch': '7.04'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4845/5680 [11:59:38<1:49:36,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4846/5680 [11:59:46<1:49:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3631', 'grad_norm': '0.3724', 'learning_rate': '1.048e-05', 'ppl': '1.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39698432, 'tokens/trainable': 39254960, 'epoch': '7.04'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4846/5680 [11:59:46<1:49:29,  7.88s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4847/5680 [11:59:54<1:49:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3394', 'grad_norm': '0.3954', 'learning_rate': '1.045e-05', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 39706624, 'tokens/trainable': 39263072, 'epoch': '7.04'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 4847/5680 [11:59:54<1:49:13,  7.87s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4848/5680 [12:00:02<1:48:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3434', 'grad_norm': '0.3898', 'learning_rate': '1.043e-05', 'ppl': '1.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 39714816, 'tokens/trainable': 39271228, 'epoch': '7.04'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4848/5680 [12:00:02<1:48:51,  7.85s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4849/5680 [12:00:09<1:48:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4672', 'grad_norm': '0.3996', 'learning_rate': '1.04e-05', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39723008, 'tokens/trainable': 39279360, 'epoch': '7.04'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4849/5680 [12:00:09<1:48:44,  7.85s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4850/5680 [12:00:17<1:48:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4779', 'grad_norm': '0.3991', 'learning_rate': '1.038e-05', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39731200, 'tokens/trainable': 39287540, 'epoch': '7.04'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4850/5680 [12:00:17<1:48:45,  7.86s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4851/5680 [12:00:25<1:48:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4986', 'grad_norm': '0.4223', 'learning_rate': '1.035e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39739392, 'tokens/trainable': 39295700, 'epoch': '7.041'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 4851/5680 [12:00:25<1:48:33,  7.86s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4852/5680 [12:00:33<1:48:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4882', 'grad_norm': '0.4267', 'learning_rate': '1.033e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39747584, 'tokens/trainable': 39303820, 'epoch': '7.041'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4852/5680 [12:00:33<1:48:20,  7.85s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4853/5680 [12:00:41<1:48:19,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.488', 'grad_norm': '0.4146', 'learning_rate': '1.03e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 39755776, 'tokens/trainable': 39311904, 'epoch': '7.041'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4853/5680 [12:00:41<1:48:19,  7.86s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4854/5680 [12:00:49<1:48:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4834', 'grad_norm': '0.4086', 'learning_rate': '1.028e-05', 'ppl': '1.622', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39763968, 'tokens/trainable': 39320012, 'epoch': '7.041'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 4854/5680 [12:00:49<1:48:09,  7.86s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4855/5680 [12:00:57<1:47:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6127', 'grad_norm': '0.4537', 'learning_rate': '1.026e-05', 'ppl': '1.845', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39772160, 'tokens/trainable': 39328148, 'epoch': '7.041'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4855/5680 [12:00:57<1:47:53,  7.85s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4856/5680 [12:01:04<1:47:41,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4831', 'grad_norm': '0.4087', 'learning_rate': '1.023e-05', 'ppl': '1.621', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39780352, 'tokens/trainable': 39336268, 'epoch': '7.042'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4856/5680 [12:01:04<1:47:41,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4857/5680 [12:01:12<1:47:29,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3988', 'grad_norm': '0.3917', 'learning_rate': '1.021e-05', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 39788544, 'tokens/trainable': 39344448, 'epoch': '7.042'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4857/5680 [12:01:12<1:47:29,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4858/5680 [12:01:20<1:47:17,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4252', 'grad_norm': '0.4772', 'learning_rate': '1.018e-05', 'ppl': '1.53', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39796736, 'tokens/trainable': 39352584, 'epoch': '7.042'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 4858/5680 [12:01:20<1:47:17,  7.83s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4859/5680 [12:01:28<1:47:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5475', 'grad_norm': '0.4592', 'learning_rate': '1.016e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 39804928, 'tokens/trainable': 39360720, 'epoch': '7.042'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4859/5680 [12:01:28<1:47:25,  7.85s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4860/5680 [12:01:36<1:47:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5913', 'grad_norm': '0.4609', 'learning_rate': '1.013e-05', 'ppl': '1.806', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39813120, 'tokens/trainable': 39368892, 'epoch': '7.042'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4860/5680 [12:01:36<1:47:18,  7.85s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4861/5680 [12:01:44<1:47:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4529', 'grad_norm': '0.4485', 'learning_rate': '1.011e-05', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 39821312, 'tokens/trainable': 39377044, 'epoch': '7.042'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4861/5680 [12:01:44<1:47:23,  7.87s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4862/5680 [12:01:52<1:47:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4705', 'grad_norm': '0.4018', 'learning_rate': '1.009e-05', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 39829504, 'tokens/trainable': 39385196, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 4862/5680 [12:01:52<1:47:23,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4863/5680 [12:02:00<1:47:20,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3209', 'grad_norm': '0.3843', 'learning_rate': '1.006e-05', 'ppl': '1.378', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39837696, 'tokens/trainable': 39393376, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4863/5680 [12:02:00<1:47:20,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4864/5680 [12:02:07<1:47:13,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5277', 'grad_norm': '0.4061', 'learning_rate': '1.004e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39845888, 'tokens/trainable': 39401544, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4864/5680 [12:02:07<1:47:13,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4865/5680 [12:02:15<1:47:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4619', 'grad_norm': '0.4326', 'learning_rate': '1.001e-05', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 39854080, 'tokens/trainable': 39409728, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4865/5680 [12:02:15<1:47:07,  7.89s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4866/5680 [12:02:23<1:46:52,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4052', 'grad_norm': '0.4168', 'learning_rate': '9.989e-06', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 39862272, 'tokens/trainable': 39417808, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4866/5680 [12:02:23<1:46:52,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4867/5680 [12:02:31<1:46:42,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6181', 'grad_norm': '0.4402', 'learning_rate': '9.965e-06', 'ppl': '1.855', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39870464, 'tokens/trainable': 39425984, 'epoch': '7.043'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4867/5680 [12:02:31<1:46:42,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4868/5680 [12:02:39<1:46:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3466', 'grad_norm': '0.4048', 'learning_rate': '9.941e-06', 'ppl': '1.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39878656, 'tokens/trainable': 39434120, 'epoch': '7.044'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4868/5680 [12:02:39<1:46:31,  7.87s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4869/5680 [12:02:47<1:46:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3762', 'grad_norm': '0.4397', 'learning_rate': '9.917e-06', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39886848, 'tokens/trainable': 39442288, 'epoch': '7.044'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4869/5680 [12:02:47<1:46:29,  7.88s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4870/5680 [12:02:55<1:46:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4102', 'grad_norm': '0.4195', 'learning_rate': '9.893e-06', 'ppl': '1.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 39895040, 'tokens/trainable': 39450456, 'epoch': '7.044'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4870/5680 [12:02:55<1:46:10,  7.86s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4871/5680 [12:03:02<1:45:44,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3585', 'grad_norm': '0.3896', 'learning_rate': '9.869e-06', 'ppl': '1.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 39903232, 'tokens/trainable': 39458584, 'epoch': '7.044'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4871/5680 [12:03:02<1:45:44,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4872/5680 [12:03:10<1:45:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4072', 'grad_norm': '0.412', 'learning_rate': '9.845e-06', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39911424, 'tokens/trainable': 39466752, 'epoch': '7.044'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4872/5680 [12:03:10<1:45:47,  7.86s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4873/5680 [12:03:18<1:45:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.431', 'grad_norm': '0.3893', 'learning_rate': '9.821e-06', 'ppl': '1.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 39919616, 'tokens/trainable': 39474908, 'epoch': '7.045'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4873/5680 [12:03:18<1:45:36,  7.85s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4874/5680 [12:03:26<1:45:16,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4309', 'grad_norm': '0.4343', 'learning_rate': '9.797e-06', 'ppl': '1.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 39927808, 'tokens/trainable': 39483064, 'epoch': '7.045'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4874/5680 [12:03:26<1:45:16,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4875/5680 [12:03:34<1:45:08,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.52', 'grad_norm': '0.4481', 'learning_rate': '9.773e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 39936000, 'tokens/trainable': 39491224, 'epoch': '7.045'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4875/5680 [12:03:34<1:45:08,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4876/5680 [12:03:42<1:45:04,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5935', 'grad_norm': '0.5003', 'learning_rate': '9.749e-06', 'ppl': '1.81', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 39944192, 'tokens/trainable': 39499408, 'epoch': '7.045'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4876/5680 [12:03:42<1:45:04,  7.84s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4877/5680 [12:03:50<1:45:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3615', 'grad_norm': '0.3978', 'learning_rate': '9.726e-06', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39952384, 'tokens/trainable': 39507568, 'epoch': '7.045'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 4877/5680 [12:03:50<1:45:06,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4878/5680 [12:03:57<1:45:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.393', 'grad_norm': '0.368', 'learning_rate': '9.702e-06', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39960576, 'tokens/trainable': 39515736, 'epoch': '7.045'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4878/5680 [12:03:57<1:45:00,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4879/5680 [12:04:05<1:44:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.477', 'grad_norm': '0.452', 'learning_rate': '9.678e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 39968768, 'tokens/trainable': 39523864, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4879/5680 [12:04:05<1:44:51,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4880/5680 [12:04:13<1:44:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4241', 'grad_norm': '0.4529', 'learning_rate': '9.654e-06', 'ppl': '1.528', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 39976960, 'tokens/trainable': 39532040, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4880/5680 [12:04:13<1:44:45,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4881/5680 [12:04:21<1:44:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5118', 'grad_norm': '0.4407', 'learning_rate': '9.631e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 39985152, 'tokens/trainable': 39540192, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4881/5680 [12:04:21<1:44:41,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4882/5680 [12:04:29<1:44:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3435', 'grad_norm': '0.425', 'learning_rate': '9.607e-06', 'ppl': '1.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 39993344, 'tokens/trainable': 39548340, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4882/5680 [12:04:29<1:44:30,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4883/5680 [12:04:37<1:44:03,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4187', 'grad_norm': '0.3996', 'learning_rate': '9.583e-06', 'ppl': '1.52', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1049', 'tokens/total': 40001536, 'tokens/trainable': 39556496, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4883/5680 [12:04:37<1:44:03,  7.83s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4884/5680 [12:04:44<1:44:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2863', 'grad_norm': '0.3813', 'learning_rate': '9.56e-06', 'ppl': '1.331', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 40009728, 'tokens/trainable': 39564616, 'epoch': '7.046'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 4884/5680 [12:04:44<1:44:10,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4885/5680 [12:04:52<1:43:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5106', 'grad_norm': '0.4087', 'learning_rate': '9.536e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40017920, 'tokens/trainable': 39572744, 'epoch': '7.047'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4885/5680 [12:04:52<1:43:58,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4886/5680 [12:05:00<1:43:42,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.549', 'grad_norm': '0.409', 'learning_rate': '9.513e-06', 'ppl': '1.732', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 40026112, 'tokens/trainable': 39580900, 'epoch': '7.047'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4886/5680 [12:05:00<1:43:42,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4887/5680 [12:05:08<1:43:37,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6136', 'grad_norm': '0.463', 'learning_rate': '9.489e-06', 'ppl': '1.847', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 40034304, 'tokens/trainable': 39589080, 'epoch': '7.047'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4887/5680 [12:05:08<1:43:37,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4888/5680 [12:05:16<1:43:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3331', 'grad_norm': '0.393', 'learning_rate': '9.466e-06', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40042496, 'tokens/trainable': 39597236, 'epoch': '7.047'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 4888/5680 [12:05:16<1:43:35,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4889/5680 [12:05:24<1:43:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4924', 'grad_norm': '0.4656', 'learning_rate': '9.442e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 40050688, 'tokens/trainable': 39605420, 'epoch': '7.047'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4889/5680 [12:05:24<1:43:25,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4890/5680 [12:05:32<1:43:12,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5718', 'grad_norm': '0.4349', 'learning_rate': '9.419e-06', 'ppl': '1.771', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40058880, 'tokens/trainable': 39613488, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4890/5680 [12:05:32<1:43:12,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4891/5680 [12:05:39<1:43:04,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.327', 'grad_norm': '0.4143', 'learning_rate': '9.395e-06', 'ppl': '1.387', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40067072, 'tokens/trainable': 39621624, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 4891/5680 [12:05:39<1:43:04,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4892/5680 [12:05:47<1:42:48,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4846', 'grad_norm': '0.4801', 'learning_rate': '9.372e-06', 'ppl': '1.623', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40075264, 'tokens/trainable': 39629716, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4892/5680 [12:05:47<1:42:48,  7.83s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4893/5680 [12:05:55<1:42:51,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.436', 'grad_norm': '0.4268', 'learning_rate': '9.348e-06', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40083456, 'tokens/trainable': 39637872, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4893/5680 [12:05:55<1:42:51,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4894/5680 [12:06:03<1:42:45,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3483', 'grad_norm': '0.3938', 'learning_rate': '9.325e-06', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40091648, 'tokens/trainable': 39646000, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4894/5680 [12:06:03<1:42:45,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4895/5680 [12:06:11<1:42:37,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5793', 'grad_norm': '0.429', 'learning_rate': '9.302e-06', 'ppl': '1.785', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 40099840, 'tokens/trainable': 39654160, 'epoch': '7.048'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4895/5680 [12:06:11<1:42:37,  7.84s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4896/5680 [12:06:19<1:42:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5017', 'grad_norm': '0.4492', 'learning_rate': '9.279e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40108032, 'tokens/trainable': 39662324, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4896/5680 [12:06:19<1:42:31,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4897/5680 [12:06:26<1:42:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4189', 'grad_norm': '0.4468', 'learning_rate': '9.255e-06', 'ppl': '1.52', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 40116224, 'tokens/trainable': 39670496, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4897/5680 [12:06:26<1:42:29,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4898/5680 [12:06:34<1:42:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3264', 'grad_norm': '0.4086', 'learning_rate': '9.232e-06', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40124416, 'tokens/trainable': 39678664, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4898/5680 [12:06:34<1:42:26,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4899/5680 [12:06:42<1:42:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5144', 'grad_norm': '0.4476', 'learning_rate': '9.209e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 40132608, 'tokens/trainable': 39686768, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 4899/5680 [12:06:42<1:42:17,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4900/5680 [12:06:50<1:42:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4196', 'grad_norm': '0.3881', 'learning_rate': '9.186e-06', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40140800, 'tokens/trainable': 39694888, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4900/5680 [12:06:50<1:42:08,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4901/5680 [12:06:58<1:42:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4936', 'grad_norm': '0.4468', 'learning_rate': '9.163e-06', 'ppl': '1.638', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40148992, 'tokens/trainable': 39703048, 'epoch': '7.049'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4901/5680 [12:06:58<1:42:04,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4902/5680 [12:07:06<1:41:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5402', 'grad_norm': '0.4956', 'learning_rate': '9.139e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 40157184, 'tokens/trainable': 39711224, 'epoch': '7.05'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4902/5680 [12:07:06<1:41:50,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4903/5680 [12:07:14<1:41:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4026', 'grad_norm': '0.4417', 'learning_rate': '9.116e-06', 'ppl': '1.496', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 40165376, 'tokens/trainable': 39719372, 'epoch': '7.05'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 4903/5680 [12:07:14<1:41:57,  7.87s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4904/5680 [12:07:21<1:41:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2584', 'grad_norm': '0.3535', 'learning_rate': '9.093e-06', 'ppl': '1.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40173568, 'tokens/trainable': 39727508, 'epoch': '7.05'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4904/5680 [12:07:21<1:41:43,  7.86s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4905/5680 [12:07:29<1:41:24,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2432', 'grad_norm': '0.3661', 'learning_rate': '9.07e-06', 'ppl': '1.275', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40181760, 'tokens/trainable': 39735588, 'epoch': '7.05'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4905/5680 [12:07:29<1:41:24,  7.85s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4906/5680 [12:07:37<1:41:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4706', 'grad_norm': '0.4994', 'learning_rate': '9.047e-06', 'ppl': '1.601', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 40189952, 'tokens/trainable': 39743764, 'epoch': '7.05'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 4906/5680 [12:07:37<1:41:16,  7.85s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4907/5680 [12:07:45<1:41:03,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3692', 'grad_norm': '0.4004', 'learning_rate': '9.024e-06', 'ppl': '1.447', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 40198144, 'tokens/trainable': 39751928, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4907/5680 [12:07:45<1:41:03,  7.84s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4908/5680 [12:07:53<1:41:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5216', 'grad_norm': '0.4025', 'learning_rate': '9.001e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40206336, 'tokens/trainable': 39760100, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4908/5680 [12:07:53<1:41:04,  7.86s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4909/5680 [12:08:01<1:41:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.289', 'grad_norm': '0.3487', 'learning_rate': '8.978e-06', 'ppl': '1.335', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40214528, 'tokens/trainable': 39768280, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4909/5680 [12:08:01<1:41:05,  7.87s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4910/5680 [12:08:09<1:41:04,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4484', 'grad_norm': '0.4331', 'learning_rate': '8.956e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 40222720, 'tokens/trainable': 39776408, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 4910/5680 [12:08:09<1:41:04,  7.88s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4911/5680 [12:08:17<1:41:00,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4476', 'grad_norm': '0.4368', 'learning_rate': '8.933e-06', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40230912, 'tokens/trainable': 39784592, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4911/5680 [12:08:17<1:41:00,  7.88s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4912/5680 [12:08:24<1:40:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5396', 'grad_norm': '0.5569', 'learning_rate': '8.91e-06', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 40239104, 'tokens/trainable': 39792776, 'epoch': '7.051'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4912/5680 [12:08:24<1:40:48,  7.88s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4913/5680 [12:08:33<1:41:33,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2546', 'grad_norm': '0.3803', 'learning_rate': '8.887e-06', 'ppl': '1.29', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 40247296, 'tokens/trainable': 39800952, 'epoch': '7.052'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4913/5680 [12:08:33<1:41:33,  7.94s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4914/5680 [12:08:40<1:41:22,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6107', 'grad_norm': '0.4879', 'learning_rate': '8.864e-06', 'ppl': '1.842', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 40255488, 'tokens/trainable': 39809140, 'epoch': '7.052'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 4914/5680 [12:08:40<1:41:22,  7.94s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4915/5680 [12:08:48<1:40:58,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3458', 'grad_norm': '0.4108', 'learning_rate': '8.841e-06', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40263680, 'tokens/trainable': 39817292, 'epoch': '7.052'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4915/5680 [12:08:48<1:40:58,  7.92s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4916/5680 [12:08:56<1:40:28,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5592', 'grad_norm': '0.4309', 'learning_rate': '8.819e-06', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 40271872, 'tokens/trainable': 39825460, 'epoch': '7.052'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4916/5680 [12:08:56<1:40:28,  7.89s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4917/5680 [12:09:04<1:40:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4151', 'grad_norm': '0.4225', 'learning_rate': '8.796e-06', 'ppl': '1.515', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40280064, 'tokens/trainable': 39833600, 'epoch': '7.052'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4917/5680 [12:09:04<1:40:14,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4918/5680 [12:09:12<1:40:06,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3835', 'grad_norm': '0.3925', 'learning_rate': '8.773e-06', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40288256, 'tokens/trainable': 39841776, 'epoch': '7.052'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4918/5680 [12:09:12<1:40:06,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4919/5680 [12:09:20<1:39:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3714', 'grad_norm': '0.3696', 'learning_rate': '8.751e-06', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40296448, 'tokens/trainable': 39849928, 'epoch': '7.053'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4919/5680 [12:09:20<1:39:52,  7.87s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4920/5680 [12:09:28<1:39:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2598', 'grad_norm': '0.3972', 'learning_rate': '8.728e-06', 'ppl': '1.297', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40304640, 'tokens/trainable': 39858044, 'epoch': '7.053'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4920/5680 [12:09:28<1:39:39,  7.87s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4921/5680 [12:09:35<1:39:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6376', 'grad_norm': '0.4282', 'learning_rate': '8.706e-06', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 40312832, 'tokens/trainable': 39866196, 'epoch': '7.053'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 4921/5680 [12:09:35<1:39:22,  7.86s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4922/5680 [12:09:43<1:39:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4743', 'grad_norm': '0.4312', 'learning_rate': '8.683e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40321024, 'tokens/trainable': 39874304, 'epoch': '7.053'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4922/5680 [12:09:43<1:39:06,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4923/5680 [12:09:51<1:39:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3382', 'grad_norm': '0.3469', 'learning_rate': '8.66e-06', 'ppl': '1.402', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40329216, 'tokens/trainable': 39882440, 'epoch': '7.053'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4923/5680 [12:09:51<1:39:00,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4924/5680 [12:09:59<1:38:55,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5447', 'grad_norm': '0.4422', 'learning_rate': '8.638e-06', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 40337408, 'tokens/trainable': 39890628, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4924/5680 [12:09:59<1:38:55,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4925/5680 [12:10:07<1:38:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5329', 'grad_norm': '0.4392', 'learning_rate': '8.615e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40345600, 'tokens/trainable': 39898788, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4925/5680 [12:10:07<1:38:46,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4926/5680 [12:10:15<1:38:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4779', 'grad_norm': '0.432', 'learning_rate': '8.593e-06', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 40353792, 'tokens/trainable': 39906856, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4926/5680 [12:10:15<1:38:35,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4927/5680 [12:10:23<1:38:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.523', 'grad_norm': '0.4632', 'learning_rate': '8.571e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40361984, 'tokens/trainable': 39915008, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4927/5680 [12:10:23<1:38:32,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4928/5680 [12:10:30<1:38:17,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5054', 'grad_norm': '0.4717', 'learning_rate': '8.548e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 40370176, 'tokens/trainable': 39923152, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4928/5680 [12:10:30<1:38:17,  7.84s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4929/5680 [12:10:38<1:38:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4921', 'grad_norm': '0.3939', 'learning_rate': '8.526e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 40378368, 'tokens/trainable': 39931336, 'epoch': '7.054'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 4929/5680 [12:10:38<1:38:18,  7.85s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4930/5680 [12:10:46<1:38:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5929', 'grad_norm': '0.4837', 'learning_rate': '8.504e-06', 'ppl': '1.809', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40386560, 'tokens/trainable': 39939472, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4930/5680 [12:10:46<1:38:13,  7.86s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4931/5680 [12:10:54<1:38:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3616', 'grad_norm': '0.4026', 'learning_rate': '8.481e-06', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 40394752, 'tokens/trainable': 39947612, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4931/5680 [12:10:54<1:38:16,  7.87s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4932/5680 [12:11:02<1:38:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3561', 'grad_norm': '0.4195', 'learning_rate': '8.459e-06', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40402944, 'tokens/trainable': 39955800, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4932/5680 [12:11:02<1:38:15,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4933/5680 [12:11:10<1:38:07,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3172', 'grad_norm': '0.3995', 'learning_rate': '8.437e-06', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 40411136, 'tokens/trainable': 39963936, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4933/5680 [12:11:10<1:38:07,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4934/5680 [12:11:18<1:37:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4415', 'grad_norm': '0.4515', 'learning_rate': '8.414e-06', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40419328, 'tokens/trainable': 39972088, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4934/5680 [12:11:18<1:37:56,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4935/5680 [12:11:26<1:37:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3705', 'grad_norm': '0.4336', 'learning_rate': '8.392e-06', 'ppl': '1.449', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40427520, 'tokens/trainable': 39980236, 'epoch': '7.055'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4935/5680 [12:11:26<1:37:48,  7.88s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4936/5680 [12:11:33<1:37:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4351', 'grad_norm': '0.4748', 'learning_rate': '8.37e-06', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40435712, 'tokens/trainable': 39988400, 'epoch': '7.056'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 4936/5680 [12:11:33<1:37:37,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4937/5680 [12:11:41<1:37:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6184', 'grad_norm': '0.451', 'learning_rate': '8.348e-06', 'ppl': '1.856', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 40443904, 'tokens/trainable': 39996556, 'epoch': '7.056'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4937/5680 [12:11:41<1:37:17,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4938/5680 [12:11:49<1:37:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3212', 'grad_norm': '0.4091', 'learning_rate': '8.326e-06', 'ppl': '1.379', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40452096, 'tokens/trainable': 40004744, 'epoch': '7.056'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4938/5680 [12:11:49<1:37:21,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4939/5680 [12:11:57<1:37:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4471', 'grad_norm': '0.3855', 'learning_rate': '8.304e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40460288, 'tokens/trainable': 40012872, 'epoch': '7.056'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4939/5680 [12:11:57<1:37:01,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4940/5680 [12:12:05<1:36:55,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6322', 'grad_norm': '0.4208', 'learning_rate': '8.282e-06', 'ppl': '1.882', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 40468480, 'tokens/trainable': 40020952, 'epoch': '7.056'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 4940/5680 [12:12:05<1:36:55,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4941/5680 [12:12:13<1:38:02,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5584', 'grad_norm': '0.4496', 'learning_rate': '8.26e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '984.5', 'tokens/total': 40476672, 'tokens/trainable': 40029020, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4941/5680 [12:12:13<1:38:02,  7.96s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4942/5680 [12:12:21<1:37:48,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2695', 'grad_norm': '0.4157', 'learning_rate': '8.238e-06', 'ppl': '1.309', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 40484864, 'tokens/trainable': 40037148, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4942/5680 [12:12:21<1:37:48,  7.95s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4943/5680 [12:12:29<1:37:21,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4721', 'grad_norm': '0.3902', 'learning_rate': '8.216e-06', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40493056, 'tokens/trainable': 40045308, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 4943/5680 [12:12:29<1:37:21,  7.93s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4944/5680 [12:12:37<1:37:09,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4531', 'grad_norm': '0.5113', 'learning_rate': '8.194e-06', 'ppl': '1.573', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 40501248, 'tokens/trainable': 40053416, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4944/5680 [12:12:37<1:37:09,  7.92s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4945/5680 [12:12:45<1:37:05,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.331', 'grad_norm': '0.3903', 'learning_rate': '8.172e-06', 'ppl': '1.392', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 40509440, 'tokens/trainable': 40061588, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4945/5680 [12:12:45<1:37:05,  7.93s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4946/5680 [12:12:52<1:36:36,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4267', 'grad_norm': '0.4423', 'learning_rate': '8.15e-06', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40517632, 'tokens/trainable': 40069708, 'epoch': '7.057'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4946/5680 [12:12:52<1:36:36,  7.90s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4947/5680 [12:13:00<1:36:28,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5972', 'grad_norm': '0.4004', 'learning_rate': '8.128e-06', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40525824, 'tokens/trainable': 40077888, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 4947/5680 [12:13:00<1:36:28,  7.90s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4948/5680 [12:13:08<1:36:11,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5811', 'grad_norm': '0.4304', 'learning_rate': '8.106e-06', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 40534016, 'tokens/trainable': 40085996, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4948/5680 [12:13:08<1:36:11,  7.89s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4949/5680 [12:13:16<1:36:02,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3904', 'grad_norm': '0.4191', 'learning_rate': '8.085e-06', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40542208, 'tokens/trainable': 40094168, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4949/5680 [12:13:16<1:36:02,  7.88s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4950/5680 [12:13:24<1:35:50,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4749', 'grad_norm': '0.4136', 'learning_rate': '8.063e-06', 'ppl': '1.608', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40550400, 'tokens/trainable': 40102324, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4950/5680 [12:13:24<1:35:50,  7.88s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4951/5680 [12:13:32<1:35:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4292', 'grad_norm': '0.4556', 'learning_rate': '8.041e-06', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 40558592, 'tokens/trainable': 40110500, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4951/5680 [12:13:32<1:35:38,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4952/5680 [12:13:40<1:35:26,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4512', 'grad_norm': '0.4106', 'learning_rate': '8.019e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40566784, 'tokens/trainable': 40118644, 'epoch': '7.058'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4952/5680 [12:13:40<1:35:26,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4953/5680 [12:13:47<1:35:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.455', 'grad_norm': '0.4272', 'learning_rate': '7.998e-06', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 40574976, 'tokens/trainable': 40126784, 'epoch': '7.059'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4953/5680 [12:13:47<1:35:07,  7.85s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4954/5680 [12:13:55<1:35:02,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4376', 'grad_norm': '0.4728', 'learning_rate': '7.976e-06', 'ppl': '1.549', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40583168, 'tokens/trainable': 40134928, 'epoch': '7.059'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4954/5680 [12:13:55<1:35:02,  7.85s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4955/5680 [12:14:03<1:34:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4436', 'grad_norm': '0.3911', 'learning_rate': '7.954e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 40591360, 'tokens/trainable': 40143116, 'epoch': '7.059'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 4955/5680 [12:14:03<1:34:50,  7.85s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4956/5680 [12:14:11<1:34:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4499', 'grad_norm': '0.4795', 'learning_rate': '7.933e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 40599552, 'tokens/trainable': 40151228, 'epoch': '7.059'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4956/5680 [12:14:11<1:34:49,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4957/5680 [12:14:19<1:34:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5445', 'grad_norm': '0.4304', 'learning_rate': '7.911e-06', 'ppl': '1.724', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40607744, 'tokens/trainable': 40159356, 'epoch': '7.059'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4957/5680 [12:14:19<1:34:41,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4958/5680 [12:14:27<1:34:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3467', 'grad_norm': '0.4084', 'learning_rate': '7.89e-06', 'ppl': '1.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 40615936, 'tokens/trainable': 40167484, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 4958/5680 [12:14:27<1:34:43,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4959/5680 [12:14:35<1:34:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2803', 'grad_norm': '0.4274', 'learning_rate': '7.868e-06', 'ppl': '1.324', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40624128, 'tokens/trainable': 40175576, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4959/5680 [12:14:35<1:34:31,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4960/5680 [12:14:43<1:34:23,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4053', 'grad_norm': '0.4595', 'learning_rate': '7.847e-06', 'ppl': '1.5', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40632320, 'tokens/trainable': 40183708, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4960/5680 [12:14:43<1:34:23,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4961/5680 [12:14:50<1:34:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4997', 'grad_norm': '0.4587', 'learning_rate': '7.825e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 40640512, 'tokens/trainable': 40191880, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4961/5680 [12:14:50<1:34:16,  7.87s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4962/5680 [12:14:58<1:33:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4864', 'grad_norm': '0.4103', 'learning_rate': '7.804e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 40648704, 'tokens/trainable': 40200056, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 4962/5680 [12:14:58<1:33:59,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4963/5680 [12:15:06<1:33:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6376', 'grad_norm': '0.4371', 'learning_rate': '7.782e-06', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40656896, 'tokens/trainable': 40208184, 'epoch': '7.06'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4963/5680 [12:15:06<1:33:57,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4964/5680 [12:15:14<1:33:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4717', 'grad_norm': '0.4293', 'learning_rate': '7.761e-06', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40665088, 'tokens/trainable': 40216348, 'epoch': '7.061'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4964/5680 [12:15:14<1:33:47,  7.86s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4965/5680 [12:15:22<1:33:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5178', 'grad_norm': '0.4477', 'learning_rate': '7.74e-06', 'ppl': '1.678', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40673280, 'tokens/trainable': 40224500, 'epoch': '7.061'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4965/5680 [12:15:22<1:33:36,  7.85s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4966/5680 [12:15:30<1:33:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2994', 'grad_norm': '0.3702', 'learning_rate': '7.718e-06', 'ppl': '1.349', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40681472, 'tokens/trainable': 40232648, 'epoch': '7.061'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4966/5680 [12:15:30<1:33:39,  7.87s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4967/5680 [12:15:38<1:33:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4397', 'grad_norm': '0.4185', 'learning_rate': '7.697e-06', 'ppl': '1.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40689664, 'tokens/trainable': 40240740, 'epoch': '7.061'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4967/5680 [12:15:38<1:33:28,  7.87s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4968/5680 [12:15:45<1:33:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4388', 'grad_norm': '0.3974', 'learning_rate': '7.676e-06', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 40697856, 'tokens/trainable': 40248856, 'epoch': '7.061'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4968/5680 [12:15:45<1:33:18,  7.86s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4969/5680 [12:15:53<1:32:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5474', 'grad_norm': '0.4311', 'learning_rate': '7.654e-06', 'ppl': '1.729', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1049', 'tokens/total': 40706048, 'tokens/trainable': 40257040, 'epoch': '7.061'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 4969/5680 [12:15:53<1:32:57,  7.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4970/5680 [12:16:01<1:32:49,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2347', 'grad_norm': '0.4271', 'learning_rate': '7.633e-06', 'ppl': '1.264', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40714240, 'tokens/trainable': 40265180, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4970/5680 [12:16:01<1:32:49,  7.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4971/5680 [12:16:09<1:32:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2229', 'grad_norm': '0.3556', 'learning_rate': '7.612e-06', 'ppl': '1.25', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 40722432, 'tokens/trainable': 40273352, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4971/5680 [12:16:09<1:32:46,  7.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4972/5680 [12:16:17<1:32:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5681', 'grad_norm': '0.4426', 'learning_rate': '7.591e-06', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 40730624, 'tokens/trainable': 40281516, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4972/5680 [12:16:17<1:32:37,  7.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4973/5680 [12:16:25<1:32:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5029', 'grad_norm': '0.3961', 'learning_rate': '7.57e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40738816, 'tokens/trainable': 40289692, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4973/5680 [12:16:25<1:32:34,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4974/5680 [12:16:33<1:32:29,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3123', 'grad_norm': '0.3947', 'learning_rate': '7.549e-06', 'ppl': '1.367', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40747008, 'tokens/trainable': 40297840, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4974/5680 [12:16:33<1:32:29,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4975/5680 [12:16:40<1:32:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3277', 'grad_norm': '0.4189', 'learning_rate': '7.528e-06', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 40755200, 'tokens/trainable': 40305960, 'epoch': '7.062'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4975/5680 [12:16:40<1:32:30,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4976/5680 [12:16:48<1:32:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5972', 'grad_norm': '0.491', 'learning_rate': '7.507e-06', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40763392, 'tokens/trainable': 40314124, 'epoch': '7.063'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4976/5680 [12:16:48<1:32:26,  7.88s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4977/5680 [12:16:56<1:32:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5603', 'grad_norm': '0.4662', 'learning_rate': '7.486e-06', 'ppl': '1.751', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40771584, 'tokens/trainable': 40322304, 'epoch': '7.063'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 4977/5680 [12:16:56<1:32:15,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4978/5680 [12:17:04<1:32:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7776', 'grad_norm': '0.4688', 'learning_rate': '7.465e-06', 'ppl': '2.176', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40779776, 'tokens/trainable': 40330456, 'epoch': '7.063'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4978/5680 [12:17:04<1:32:03,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4979/5680 [12:17:12<1:31:55,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6893', 'grad_norm': '0.4385', 'learning_rate': '7.444e-06', 'ppl': '1.992', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40787968, 'tokens/trainable': 40338636, 'epoch': '7.063'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4979/5680 [12:17:12<1:31:55,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4980/5680 [12:17:20<1:31:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.496', 'grad_norm': '0.415', 'learning_rate': '7.423e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40796160, 'tokens/trainable': 40346812, 'epoch': '7.063'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4980/5680 [12:17:20<1:31:53,  7.88s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4981/5680 [12:17:28<1:31:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3326', 'grad_norm': '0.4117', 'learning_rate': '7.402e-06', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40804352, 'tokens/trainable': 40354924, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 4981/5680 [12:17:28<1:31:39,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4982/5680 [12:17:36<1:31:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2007', 'grad_norm': '0.3671', 'learning_rate': '7.381e-06', 'ppl': '1.222', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 40812544, 'tokens/trainable': 40363056, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4982/5680 [12:17:36<1:31:27,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4983/5680 [12:17:43<1:31:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6566', 'grad_norm': '0.5111', 'learning_rate': '7.36e-06', 'ppl': '1.928', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40820736, 'tokens/trainable': 40371168, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4983/5680 [12:17:43<1:31:21,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4984/5680 [12:17:51<1:31:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2824', 'grad_norm': '0.3805', 'learning_rate': '7.339e-06', 'ppl': '1.326', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40828928, 'tokens/trainable': 40379260, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4984/5680 [12:17:51<1:31:11,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4985/5680 [12:17:59<1:30:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4278', 'grad_norm': '0.3422', 'learning_rate': '7.318e-06', 'ppl': '1.534', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 40837120, 'tokens/trainable': 40387448, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4985/5680 [12:17:59<1:30:58,  7.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4986/5680 [12:18:07<1:30:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4917', 'grad_norm': '0.4174', 'learning_rate': '7.298e-06', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 40845312, 'tokens/trainable': 40395604, 'epoch': '7.064'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4986/5680 [12:18:07<1:30:59,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4987/5680 [12:18:15<1:31:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4635', 'grad_norm': '0.4109', 'learning_rate': '7.277e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 40853504, 'tokens/trainable': 40403740, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4987/5680 [12:18:15<1:31:03,  7.88s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4988/5680 [12:18:23<1:31:02,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4693', 'grad_norm': '0.4238', 'learning_rate': '7.256e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 40861696, 'tokens/trainable': 40411788, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4988/5680 [12:18:23<1:31:02,  7.89s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4989/5680 [12:18:31<1:30:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.478', 'grad_norm': '0.4094', 'learning_rate': '7.236e-06', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 40869888, 'tokens/trainable': 40419932, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4989/5680 [12:18:31<1:30:37,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4990/5680 [12:18:38<1:30:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6225', 'grad_norm': '0.481', 'learning_rate': '7.215e-06', 'ppl': '1.864', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 40878080, 'tokens/trainable': 40428096, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4990/5680 [12:18:38<1:30:25,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4991/5680 [12:18:46<1:30:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3488', 'grad_norm': '0.4381', 'learning_rate': '7.194e-06', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 40886272, 'tokens/trainable': 40436272, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4991/5680 [12:18:46<1:30:21,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4992/5680 [12:18:54<1:30:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3375', 'grad_norm': '0.3809', 'learning_rate': '7.174e-06', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 40894464, 'tokens/trainable': 40444404, 'epoch': '7.065'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 4992/5680 [12:18:54<1:30:17,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4993/5680 [12:19:02<1:30:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4342', 'grad_norm': '0.3995', 'learning_rate': '7.153e-06', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40902656, 'tokens/trainable': 40452552, 'epoch': '7.066'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4993/5680 [12:19:02<1:30:09,  7.87s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4994/5680 [12:19:10<1:29:54,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3732', 'grad_norm': '0.3955', 'learning_rate': '7.133e-06', 'ppl': '1.452', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 40910848, 'tokens/trainable': 40460696, 'epoch': '7.066'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4994/5680 [12:19:10<1:29:54,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4995/5680 [12:19:18<1:29:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3633', 'grad_norm': '0.3781', 'learning_rate': '7.112e-06', 'ppl': '1.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 40919040, 'tokens/trainable': 40468868, 'epoch': '7.066'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4995/5680 [12:19:18<1:29:44,  7.86s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4996/5680 [12:19:26<1:29:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5875', 'grad_norm': '0.4243', 'learning_rate': '7.092e-06', 'ppl': '1.8', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 40927232, 'tokens/trainable': 40476956, 'epoch': '7.066'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 4996/5680 [12:19:26<1:29:31,  7.85s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4997/5680 [12:19:34<1:29:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5206', 'grad_norm': '0.4127', 'learning_rate': '7.071e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 40935424, 'tokens/trainable': 40485084, 'epoch': '7.066'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4997/5680 [12:19:34<1:29:27,  7.86s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4998/5680 [12:19:41<1:29:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2438', 'grad_norm': '0.3905', 'learning_rate': '7.051e-06', 'ppl': '1.276', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 40943616, 'tokens/trainable': 40493260, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4998/5680 [12:19:41<1:29:25,  7.87s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4999/5680 [12:19:49<1:29:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3024', 'grad_norm': '0.3678', 'learning_rate': '7.03e-06', 'ppl': '1.353', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40951808, 'tokens/trainable': 40501376, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 4999/5680 [12:19:49<1:29:15,  7.86s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5000/5680 [12:19:57<1:29:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3843', 'grad_norm': '0.4509', 'learning_rate': '7.01e-06', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 40960000, 'tokens/trainable': 40509520, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5000/5680 [12:19:57<1:29:13,  7.87s/it][2026-01-27 10:09:11,198] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 10:09:56,784] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-5000
[2026-01-27 10:10:51,236] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 10:10:51,236] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5001/5680 [12:21:46<7:10:18, 38.02s/it]                                                                                                                                                                                                                                             {'loss': '0.4435', 'grad_norm': '0.3997', 'learning_rate': '6.99e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 40968192, 'tokens/trainable': 40517700, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5001/5680 [12:21:46<7:10:18, 38.02s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5002/5680 [12:21:53<5:27:30, 28.98s/it]                                                                                                                                                                                                                                             {'loss': '0.6547', 'grad_norm': '0.4288', 'learning_rate': '6.97e-06', 'ppl': '1.925', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 40976384, 'tokens/trainable': 40525864, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5002/5680 [12:21:53<5:27:30, 28.98s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5003/5680 [12:22:01<4:15:37, 22.66s/it]                                                                                                                                                                                                                                             {'loss': '0.2969', 'grad_norm': '0.3519', 'learning_rate': '6.949e-06', 'ppl': '1.346', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 40984576, 'tokens/trainable': 40534020, 'epoch': '7.067'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 5003/5680 [12:22:01<4:15:37, 22.66s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5004/5680 [12:22:09<3:25:16, 18.22s/it]                                                                                                                                                                                                                                             {'loss': '0.4742', 'grad_norm': '0.3927', 'learning_rate': '6.929e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 40992768, 'tokens/trainable': 40542144, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5004/5680 [12:22:09<3:25:16, 18.22s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5005/5680 [12:22:17<2:50:04, 15.12s/it]                                                                                                                                                                                                                                             {'loss': '0.4664', 'grad_norm': '0.3927', 'learning_rate': '6.909e-06', 'ppl': '1.594', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 41000960, 'tokens/trainable': 40550256, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5005/5680 [12:22:17<2:50:04, 15.12s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5006/5680 [12:22:25<2:25:27, 12.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4539', 'grad_norm': '0.4592', 'learning_rate': '6.889e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41009152, 'tokens/trainable': 40558392, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5006/5680 [12:22:25<2:25:27, 12.95s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5007/5680 [12:22:33<2:08:00, 11.41s/it]                                                                                                                                                                                                                                             {'loss': '0.4113', 'grad_norm': '0.5355', 'learning_rate': '6.868e-06', 'ppl': '1.509', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41017344, 'tokens/trainable': 40566536, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 5007/5680 [12:22:33<2:08:00, 11.41s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5008/5680 [12:22:41<1:55:57, 10.35s/it]                                                                                                                                                                                                                                             {'loss': '0.4315', 'grad_norm': '0.3609', 'learning_rate': '6.848e-06', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41025536, 'tokens/trainable': 40574672, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5008/5680 [12:22:41<1:55:57, 10.35s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5009/5680 [12:22:49<1:47:30,  9.61s/it]                                                                                                                                                                                                                                             {'loss': '0.4036', 'grad_norm': '0.4724', 'learning_rate': '6.828e-06', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41033728, 'tokens/trainable': 40582848, 'epoch': '7.068'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5009/5680 [12:22:49<1:47:30,  9.61s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5010/5680 [12:22:56<1:41:29,  9.09s/it]                                                                                                                                                                                                                                             {'loss': '0.4631', 'grad_norm': '0.4465', 'learning_rate': '6.808e-06', 'ppl': '1.589', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41041920, 'tokens/trainable': 40591004, 'epoch': '7.069'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 5010/5680 [12:22:56<1:41:29,  9.09s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5011/5680 [12:23:04<1:37:11,  8.72s/it]                                                                                                                                                                                                                                             {'loss': '0.5167', 'grad_norm': '0.4286', 'learning_rate': '6.788e-06', 'ppl': '1.676', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 41050112, 'tokens/trainable': 40599192, 'epoch': '7.069'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5011/5680 [12:23:04<1:37:11,  8.72s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5012/5680 [12:23:12<1:34:05,  8.45s/it]                                                                                                                                                                                                                                             {'loss': '0.4202', 'grad_norm': '0.3864', 'learning_rate': '6.768e-06', 'ppl': '1.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 41058304, 'tokens/trainable': 40607352, 'epoch': '7.069'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5012/5680 [12:23:12<1:34:05,  8.45s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5013/5680 [12:23:20<1:33:03,  8.37s/it]                                                                                                                                                                                                                                             {'loss': '0.2572', 'grad_norm': '0.356', 'learning_rate': '6.748e-06', 'ppl': '1.293', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 41066496, 'tokens/trainable': 40615528, 'epoch': '7.069'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5013/5680 [12:23:20<1:33:03,  8.37s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5014/5680 [12:23:28<1:31:18,  8.23s/it]                                                                                                                                                                                                                                             {'loss': '0.4123', 'grad_norm': '0.4489', 'learning_rate': '6.728e-06', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41074688, 'tokens/trainable': 40623704, 'epoch': '7.069'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 5014/5680 [12:23:28<1:31:18,  8.23s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5015/5680 [12:23:36<1:29:56,  8.12s/it]                                                                                                                                                                                                                                             {'loss': '0.4746', 'grad_norm': '0.4578', 'learning_rate': '6.708e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 41082880, 'tokens/trainable': 40631856, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5015/5680 [12:23:36<1:29:56,  8.12s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5016/5680 [12:23:44<1:28:48,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.4624', 'grad_norm': '0.4422', 'learning_rate': '6.688e-06', 'ppl': '1.588', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 41091072, 'tokens/trainable': 40640024, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5016/5680 [12:23:44<1:28:48,  8.03s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5017/5680 [12:23:52<1:28:09,  7.98s/it]                                                                                                                                                                                                                                             {'loss': '0.4141', 'grad_norm': '0.4341', 'learning_rate': '6.668e-06', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41099264, 'tokens/trainable': 40648164, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5017/5680 [12:23:52<1:28:09,  7.98s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5018/5680 [12:24:00<1:27:39,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4284', 'grad_norm': '0.3946', 'learning_rate': '6.649e-06', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 41107456, 'tokens/trainable': 40656336, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 5018/5680 [12:24:00<1:27:39,  7.94s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5019/5680 [12:24:07<1:27:21,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.7133', 'grad_norm': '0.4231', 'learning_rate': '6.629e-06', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41115648, 'tokens/trainable': 40664484, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5019/5680 [12:24:07<1:27:21,  7.93s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5020/5680 [12:24:16<1:27:52,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.5468', 'grad_norm': '0.4854', 'learning_rate': '6.609e-06', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.2', 'tokens/total': 41123840, 'tokens/trainable': 40672604, 'epoch': '7.07'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5020/5680 [12:24:16<1:27:52,  7.99s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5021/5680 [12:24:23<1:27:19,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.4474', 'grad_norm': '0.3751', 'learning_rate': '6.589e-06', 'ppl': '1.564', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41132032, 'tokens/trainable': 40680720, 'epoch': '7.071'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5021/5680 [12:24:23<1:27:19,  7.95s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5022/5680 [12:24:31<1:27:00,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4095', 'grad_norm': '0.3767', 'learning_rate': '6.569e-06', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41140224, 'tokens/trainable': 40688900, 'epoch': '7.071'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 5022/5680 [12:24:31<1:27:00,  7.93s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5023/5680 [12:24:39<1:26:45,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5497', 'grad_norm': '0.4241', 'learning_rate': '6.55e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41148416, 'tokens/trainable': 40697040, 'epoch': '7.071'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5023/5680 [12:24:39<1:26:45,  7.92s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5024/5680 [12:24:47<1:26:20,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5773', 'grad_norm': '0.4306', 'learning_rate': '6.53e-06', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41156608, 'tokens/trainable': 40705132, 'epoch': '7.071'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5024/5680 [12:24:47<1:26:20,  7.90s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5025/5680 [12:24:55<1:26:13,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.414', 'grad_norm': '0.433', 'learning_rate': '6.51e-06', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41164800, 'tokens/trainable': 40713284, 'epoch': '7.071'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 5025/5680 [12:24:55<1:26:13,  7.90s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5026/5680 [12:25:03<1:25:57,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5799', 'grad_norm': '0.4914', 'learning_rate': '6.491e-06', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41172992, 'tokens/trainable': 40721464, 'epoch': '7.071'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5026/5680 [12:25:03<1:25:57,  7.89s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5027/5680 [12:25:11<1:25:53,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6473', 'grad_norm': '0.4407', 'learning_rate': '6.471e-06', 'ppl': '1.91', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 41181184, 'tokens/trainable': 40729544, 'epoch': '7.072'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5027/5680 [12:25:11<1:25:53,  7.89s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5028/5680 [12:25:19<1:25:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5527', 'grad_norm': '0.4952', 'learning_rate': '6.452e-06', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 41189376, 'tokens/trainable': 40737648, 'epoch': '7.072'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5028/5680 [12:25:19<1:25:46,  7.89s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5029/5680 [12:25:26<1:25:32,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5031', 'grad_norm': '0.4465', 'learning_rate': '6.432e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41197568, 'tokens/trainable': 40745752, 'epoch': '7.072'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 5029/5680 [12:25:26<1:25:32,  7.88s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5030/5680 [12:25:34<1:25:27,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6362', 'grad_norm': '0.5049', 'learning_rate': '6.413e-06', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 41205760, 'tokens/trainable': 40753808, 'epoch': '7.072'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5030/5680 [12:25:34<1:25:27,  7.89s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5031/5680 [12:25:42<1:25:21,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3456', 'grad_norm': '0.3814', 'learning_rate': '6.393e-06', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41213952, 'tokens/trainable': 40761952, 'epoch': '7.072'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5031/5680 [12:25:42<1:25:21,  7.89s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5032/5680 [12:25:50<1:25:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5599', 'grad_norm': '0.4556', 'learning_rate': '6.374e-06', 'ppl': '1.75', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41222144, 'tokens/trainable': 40770100, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5032/5680 [12:25:50<1:25:00,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5033/5680 [12:25:58<1:24:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.35', 'grad_norm': '0.3501', 'learning_rate': '6.354e-06', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41230336, 'tokens/trainable': 40778192, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 5033/5680 [12:25:58<1:24:44,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5034/5680 [12:26:06<1:24:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.356', 'grad_norm': '0.4253', 'learning_rate': '6.335e-06', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41238528, 'tokens/trainable': 40786340, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5034/5680 [12:26:06<1:24:39,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5035/5680 [12:26:14<1:24:39,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3245', 'grad_norm': '0.3858', 'learning_rate': '6.316e-06', 'ppl': '1.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41246720, 'tokens/trainable': 40794500, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5035/5680 [12:26:14<1:24:39,  7.88s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5036/5680 [12:26:22<1:24:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4204', 'grad_norm': '0.3995', 'learning_rate': '6.296e-06', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41254912, 'tokens/trainable': 40802664, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 5036/5680 [12:26:22<1:24:20,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5037/5680 [12:26:29<1:24:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5674', 'grad_norm': '0.5388', 'learning_rate': '6.277e-06', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41263104, 'tokens/trainable': 40810820, 'epoch': '7.073'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5037/5680 [12:26:29<1:24:18,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5038/5680 [12:26:37<1:24:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4424', 'grad_norm': '0.3912', 'learning_rate': '6.258e-06', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41271296, 'tokens/trainable': 40818948, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5038/5680 [12:26:37<1:24:11,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5039/5680 [12:26:45<1:23:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5721', 'grad_norm': '0.4484', 'learning_rate': '6.238e-06', 'ppl': '1.772', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 41279488, 'tokens/trainable': 40827120, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5039/5680 [12:26:45<1:23:58,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5040/5680 [12:26:53<1:23:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4037', 'grad_norm': '0.4029', 'learning_rate': '6.219e-06', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41287680, 'tokens/trainable': 40835264, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 5040/5680 [12:26:53<1:23:53,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5041/5680 [12:27:01<1:23:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6425', 'grad_norm': '0.4415', 'learning_rate': '6.2e-06', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41295872, 'tokens/trainable': 40843420, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5041/5680 [12:27:01<1:23:41,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5042/5680 [12:27:09<1:23:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3344', 'grad_norm': '0.3978', 'learning_rate': '6.181e-06', 'ppl': '1.397', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 41304064, 'tokens/trainable': 40851600, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5042/5680 [12:27:09<1:23:31,  7.85s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5043/5680 [12:27:17<1:23:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5252', 'grad_norm': '0.4379', 'learning_rate': '6.162e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41312256, 'tokens/trainable': 40859724, 'epoch': '7.074'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5043/5680 [12:27:17<1:23:23,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5044/5680 [12:27:24<1:23:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5613', 'grad_norm': '0.4266', 'learning_rate': '6.143e-06', 'ppl': '1.753', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41320448, 'tokens/trainable': 40867848, 'epoch': '7.075'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 5044/5680 [12:27:24<1:23:12,  7.85s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5045/5680 [12:27:32<1:23:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6951', 'grad_norm': '0.4708', 'learning_rate': '6.124e-06', 'ppl': '2.004', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41328640, 'tokens/trainable': 40876016, 'epoch': '7.075'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5045/5680 [12:27:32<1:23:09,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5046/5680 [12:27:40<1:23:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3703', 'grad_norm': '0.437', 'learning_rate': '6.105e-06', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 41336832, 'tokens/trainable': 40884148, 'epoch': '7.075'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5046/5680 [12:27:40<1:23:11,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5047/5680 [12:27:48<1:23:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.506', 'grad_norm': '0.4326', 'learning_rate': '6.086e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41345024, 'tokens/trainable': 40892320, 'epoch': '7.075'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5047/5680 [12:27:48<1:23:00,  7.87s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5048/5680 [12:27:56<1:22:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3051', 'grad_norm': '0.397', 'learning_rate': '6.067e-06', 'ppl': '1.357', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41353216, 'tokens/trainable': 40900444, 'epoch': '7.075'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 5048/5680 [12:27:56<1:22:48,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5049/5680 [12:28:04<1:22:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5587', 'grad_norm': '0.4322', 'learning_rate': '6.048e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 41361408, 'tokens/trainable': 40908592, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5049/5680 [12:28:04<1:22:37,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5050/5680 [12:28:12<1:22:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6714', 'grad_norm': '0.5101', 'learning_rate': '6.029e-06', 'ppl': '1.957', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41369600, 'tokens/trainable': 40916720, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5050/5680 [12:28:12<1:22:26,  7.85s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5051/5680 [12:28:19<1:22:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5316', 'grad_norm': '0.4875', 'learning_rate': '6.01e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41377792, 'tokens/trainable': 40924908, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 5051/5680 [12:28:19<1:22:23,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5052/5680 [12:28:27<1:22:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4999', 'grad_norm': '0.48', 'learning_rate': '5.991e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41385984, 'tokens/trainable': 40933028, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5052/5680 [12:28:27<1:22:15,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5053/5680 [12:28:35<1:22:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5276', 'grad_norm': '0.3973', 'learning_rate': '5.972e-06', 'ppl': '1.695', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41394176, 'tokens/trainable': 40941152, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5053/5680 [12:28:35<1:22:09,  7.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5054/5680 [12:28:43<1:21:54,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5145', 'grad_norm': '0.4215', 'learning_rate': '5.953e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41402368, 'tokens/trainable': 40949288, 'epoch': '7.076'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5054/5680 [12:28:43<1:21:54,  7.85s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5055/5680 [12:28:51<1:21:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6707', 'grad_norm': '0.5189', 'learning_rate': '5.934e-06', 'ppl': '1.956', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 41410560, 'tokens/trainable': 40957432, 'epoch': '7.077'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 5055/5680 [12:28:51<1:21:44,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5056/5680 [12:28:59<1:21:33,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5661', 'grad_norm': '0.4104', 'learning_rate': '5.916e-06', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41418752, 'tokens/trainable': 40965608, 'epoch': '7.077'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5056/5680 [12:28:59<1:21:33,  7.84s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5057/5680 [12:29:07<1:21:24,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5043', 'grad_norm': '0.452', 'learning_rate': '5.897e-06', 'ppl': '1.656', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41426944, 'tokens/trainable': 40973720, 'epoch': '7.077'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5057/5680 [12:29:07<1:21:24,  7.84s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5058/5680 [12:29:14<1:21:21,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3195', 'grad_norm': '0.4269', 'learning_rate': '5.878e-06', 'ppl': '1.376', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41435136, 'tokens/trainable': 40981832, 'epoch': '7.077'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5058/5680 [12:29:14<1:21:21,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5059/5680 [12:29:22<1:21:14,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4605', 'grad_norm': '0.5564', 'learning_rate': '5.86e-06', 'ppl': '1.585', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41443328, 'tokens/trainable': 40989948, 'epoch': '7.077'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 5059/5680 [12:29:22<1:21:14,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5060/5680 [12:29:30<1:21:03,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.433', 'grad_norm': '0.4342', 'learning_rate': '5.841e-06', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 41451520, 'tokens/trainable': 40998080, 'epoch': '7.077'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5060/5680 [12:29:30<1:21:03,  7.84s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5061/5680 [12:29:38<1:20:56,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4245', 'grad_norm': '0.4445', 'learning_rate': '5.822e-06', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 41459712, 'tokens/trainable': 41006256, 'epoch': '7.078'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5061/5680 [12:29:38<1:20:56,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5062/5680 [12:29:46<1:20:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3712', 'grad_norm': '0.4535', 'learning_rate': '5.804e-06', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 41467904, 'tokens/trainable': 41014440, 'epoch': '7.078'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 5062/5680 [12:29:46<1:20:48,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5063/5680 [12:29:54<1:20:44,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5741', 'grad_norm': '0.4259', 'learning_rate': '5.785e-06', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41476096, 'tokens/trainable': 41022620, 'epoch': '7.078'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5063/5680 [12:29:54<1:20:44,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5064/5680 [12:30:01<1:20:33,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3992', 'grad_norm': '0.4024', 'learning_rate': '5.767e-06', 'ppl': '1.491', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 41484288, 'tokens/trainable': 41030780, 'epoch': '7.078'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5064/5680 [12:30:01<1:20:33,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5065/5680 [12:30:09<1:20:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.318', 'grad_norm': '0.3653', 'learning_rate': '5.748e-06', 'ppl': '1.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 41492480, 'tokens/trainable': 41038920, 'epoch': '7.078'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5065/5680 [12:30:09<1:20:37,  7.87s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5066/5680 [12:30:17<1:20:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4102', 'grad_norm': '0.4322', 'learning_rate': '5.73e-06', 'ppl': '1.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41500672, 'tokens/trainable': 41047100, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 5066/5680 [12:30:17<1:20:24,  7.86s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5067/5680 [12:30:25<1:20:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4164', 'grad_norm': '0.4403', 'learning_rate': '5.711e-06', 'ppl': '1.516', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41508864, 'tokens/trainable': 41055200, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5067/5680 [12:30:25<1:20:17,  7.86s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5068/5680 [12:30:33<1:20:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6521', 'grad_norm': '0.4145', 'learning_rate': '5.693e-06', 'ppl': '1.92', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41517056, 'tokens/trainable': 41063360, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5068/5680 [12:30:33<1:20:06,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5069/5680 [12:30:41<1:20:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7572', 'grad_norm': '0.4481', 'learning_rate': '5.674e-06', 'ppl': '2.132', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 41525248, 'tokens/trainable': 41071416, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5069/5680 [12:30:41<1:20:01,  7.86s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5070/5680 [12:30:49<1:19:47,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4464', 'grad_norm': '0.4023', 'learning_rate': '5.656e-06', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41533440, 'tokens/trainable': 41079584, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 5070/5680 [12:30:49<1:19:47,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5071/5680 [12:30:56<1:19:37,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3888', 'grad_norm': '0.3963', 'learning_rate': '5.638e-06', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41541632, 'tokens/trainable': 41087736, 'epoch': '7.079'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5071/5680 [12:30:56<1:19:37,  7.84s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5072/5680 [12:31:04<1:19:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3974', 'grad_norm': '0.4506', 'learning_rate': '5.619e-06', 'ppl': '1.488', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 41549824, 'tokens/trainable': 41095836, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5072/5680 [12:31:04<1:19:35,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5073/5680 [12:31:12<1:19:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5984', 'grad_norm': '0.4724', 'learning_rate': '5.601e-06', 'ppl': '1.819', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41558016, 'tokens/trainable': 41103952, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5073/5680 [12:31:12<1:19:25,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5074/5680 [12:31:20<1:19:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4784', 'grad_norm': '0.4362', 'learning_rate': '5.583e-06', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41566208, 'tokens/trainable': 41112104, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 5074/5680 [12:31:20<1:19:20,  7.86s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5075/5680 [12:31:28<1:19:59,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.4273', 'grad_norm': '0.4335', 'learning_rate': '5.565e-06', 'ppl': '1.533', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 41574400, 'tokens/trainable': 41120248, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5075/5680 [12:31:28<1:19:59,  7.93s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5076/5680 [12:31:36<1:19:36,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4559', 'grad_norm': '0.4114', 'learning_rate': '5.547e-06', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41582592, 'tokens/trainable': 41128408, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5076/5680 [12:31:36<1:19:36,  7.91s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5077/5680 [12:31:44<1:19:24,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5647', 'grad_norm': '0.4', 'learning_rate': '5.528e-06', 'ppl': '1.759', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41590784, 'tokens/trainable': 41136584, 'epoch': '7.08'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 5077/5680 [12:31:44<1:19:24,  7.90s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5078/5680 [12:31:52<1:19:04,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5211', 'grad_norm': '0.4335', 'learning_rate': '5.51e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41598976, 'tokens/trainable': 41144684, 'epoch': '7.081'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5078/5680 [12:31:52<1:19:04,  7.88s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5079/5680 [12:32:00<1:18:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.451', 'grad_norm': '0.4393', 'learning_rate': '5.492e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 41607168, 'tokens/trainable': 41152848, 'epoch': '7.081'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5079/5680 [12:32:00<1:18:52,  7.87s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5080/5680 [12:32:07<1:18:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5747', 'grad_norm': '0.4845', 'learning_rate': '5.474e-06', 'ppl': '1.777', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41615360, 'tokens/trainable': 41161032, 'epoch': '7.081'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5080/5680 [12:32:07<1:18:43,  7.87s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5081/5680 [12:32:15<1:18:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2742', 'grad_norm': '0.363', 'learning_rate': '5.456e-06', 'ppl': '1.315', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 41623552, 'tokens/trainable': 41169196, 'epoch': '7.081'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 5081/5680 [12:32:15<1:18:27,  7.86s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5082/5680 [12:32:23<1:18:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4688', 'grad_norm': '0.3889', 'learning_rate': '5.438e-06', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41631744, 'tokens/trainable': 41177360, 'epoch': '7.081'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5082/5680 [12:32:23<1:18:16,  7.85s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5083/5680 [12:32:31<1:18:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4137', 'grad_norm': '0.4311', 'learning_rate': '5.42e-06', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41639936, 'tokens/trainable': 41185524, 'epoch': '7.082'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5083/5680 [12:32:31<1:18:20,  7.87s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5084/5680 [12:32:39<1:18:23,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4126', 'grad_norm': '0.437', 'learning_rate': '5.402e-06', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 41648128, 'tokens/trainable': 41193604, 'epoch': '7.082'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5084/5680 [12:32:39<1:18:23,  7.89s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5085/5680 [12:32:47<1:18:08,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4009', 'grad_norm': '0.3972', 'learning_rate': '5.384e-06', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 41656320, 'tokens/trainable': 41201664, 'epoch': '7.082'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 5085/5680 [12:32:47<1:18:08,  7.88s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5086/5680 [12:32:55<1:17:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5869', 'grad_norm': '0.4533', 'learning_rate': '5.366e-06', 'ppl': '1.798', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41664512, 'tokens/trainable': 41209764, 'epoch': '7.082'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5086/5680 [12:32:55<1:17:52,  7.87s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5087/5680 [12:33:02<1:17:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7127', 'grad_norm': '0.4449', 'learning_rate': '5.349e-06', 'ppl': '2.04', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41672704, 'tokens/trainable': 41217928, 'epoch': '7.082'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5087/5680 [12:33:02<1:17:37,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5088/5680 [12:33:10<1:17:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4819', 'grad_norm': '0.4493', 'learning_rate': '5.331e-06', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41680896, 'tokens/trainable': 41226040, 'epoch': '7.082'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 5088/5680 [12:33:10<1:17:25,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5089/5680 [12:33:18<1:17:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5285', 'grad_norm': '0.4706', 'learning_rate': '5.313e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41689088, 'tokens/trainable': 41234136, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5089/5680 [12:33:18<1:17:17,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5090/5680 [12:33:26<1:17:05,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3985', 'grad_norm': '0.4426', 'learning_rate': '5.295e-06', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 41697280, 'tokens/trainable': 41242256, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5090/5680 [12:33:26<1:17:05,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5091/5680 [12:33:34<1:17:02,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3648', 'grad_norm': '0.3773', 'learning_rate': '5.277e-06', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41705472, 'tokens/trainable': 41250392, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5091/5680 [12:33:34<1:17:02,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5092/5680 [12:33:42<1:16:51,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3722', 'grad_norm': '0.3736', 'learning_rate': '5.26e-06', 'ppl': '1.451', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 41713664, 'tokens/trainable': 41258516, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 5092/5680 [12:33:42<1:16:51,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5093/5680 [12:33:50<1:16:46,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.368', 'grad_norm': '0.3881', 'learning_rate': '5.242e-06', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 41721856, 'tokens/trainable': 41266584, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5093/5680 [12:33:50<1:16:46,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5094/5680 [12:33:57<1:16:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3817', 'grad_norm': '0.4546', 'learning_rate': '5.224e-06', 'ppl': '1.465', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 41730048, 'tokens/trainable': 41274768, 'epoch': '7.083'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5094/5680 [12:33:57<1:16:36,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5095/5680 [12:34:05<1:16:34,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3939', 'grad_norm': '0.4328', 'learning_rate': '5.207e-06', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 41738240, 'tokens/trainable': 41282864, 'epoch': '7.084'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5095/5680 [12:34:05<1:16:34,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5096/5680 [12:34:13<1:16:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4083', 'grad_norm': '0.4301', 'learning_rate': '5.189e-06', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41746432, 'tokens/trainable': 41291032, 'epoch': '7.084'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 5096/5680 [12:34:13<1:16:31,  7.86s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5097/5680 [12:34:21<1:16:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.1837', 'grad_norm': '0.3208', 'learning_rate': '5.172e-06', 'ppl': '1.202', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 41754624, 'tokens/trainable': 41299020, 'epoch': '7.084'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5097/5680 [12:34:21<1:16:25,  7.87s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5098/5680 [12:34:29<1:16:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4827', 'grad_norm': '0.4341', 'learning_rate': '5.154e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41762816, 'tokens/trainable': 41307176, 'epoch': '7.084'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5098/5680 [12:34:29<1:16:13,  7.86s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5099/5680 [12:34:37<1:16:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3839', 'grad_norm': '0.4437', 'learning_rate': '5.136e-06', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41771008, 'tokens/trainable': 41315316, 'epoch': '7.084'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5099/5680 [12:34:37<1:16:07,  7.86s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5100/5680 [12:34:45<1:16:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5302', 'grad_norm': '0.393', 'learning_rate': '5.119e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41779200, 'tokens/trainable': 41323472, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 5100/5680 [12:34:45<1:16:02,  7.87s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5101/5680 [12:34:52<1:15:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4583', 'grad_norm': '0.4378', 'learning_rate': '5.102e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41787392, 'tokens/trainable': 41331628, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5101/5680 [12:34:52<1:15:57,  7.87s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5102/5680 [12:35:00<1:15:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3783', 'grad_norm': '0.4047', 'learning_rate': '5.084e-06', 'ppl': '1.46', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 41795584, 'tokens/trainable': 41339656, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5102/5680 [12:35:00<1:15:44,  7.86s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5103/5680 [12:35:08<1:15:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.465', 'grad_norm': '0.4246', 'learning_rate': '5.067e-06', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 41803776, 'tokens/trainable': 41347768, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 5103/5680 [12:35:08<1:15:43,  7.87s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5104/5680 [12:35:16<1:15:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4302', 'grad_norm': '0.4186', 'learning_rate': '5.049e-06', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 41811968, 'tokens/trainable': 41355880, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5104/5680 [12:35:16<1:15:23,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5105/5680 [12:35:24<1:15:13,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4218', 'grad_norm': '0.4028', 'learning_rate': '5.032e-06', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41820160, 'tokens/trainable': 41363968, 'epoch': '7.085'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5105/5680 [12:35:24<1:15:13,  7.85s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5106/5680 [12:35:32<1:15:02,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5883', 'grad_norm': '0.4278', 'learning_rate': '5.015e-06', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 41828352, 'tokens/trainable': 41372092, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5106/5680 [12:35:32<1:15:02,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5107/5680 [12:35:40<1:14:50,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5745', 'grad_norm': '0.4262', 'learning_rate': '4.997e-06', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 41836544, 'tokens/trainable': 41380264, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5107/5680 [12:35:40<1:14:50,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5108/5680 [12:35:47<1:14:39,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5738', 'grad_norm': '0.4469', 'learning_rate': '4.98e-06', 'ppl': '1.775', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41844736, 'tokens/trainable': 41388328, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5108/5680 [12:35:47<1:14:39,  7.83s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5109/5680 [12:35:55<1:14:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3602', 'grad_norm': '0.4213', 'learning_rate': '4.963e-06', 'ppl': '1.434', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 41852928, 'tokens/trainable': 41396456, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5109/5680 [12:35:55<1:14:36,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5110/5680 [12:36:03<1:14:23,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4167', 'grad_norm': '0.4119', 'learning_rate': '4.946e-06', 'ppl': '1.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 41861120, 'tokens/trainable': 41404520, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5110/5680 [12:36:03<1:14:23,  7.83s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5111/5680 [12:36:11<1:14:21,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5675', 'grad_norm': '0.4218', 'learning_rate': '4.929e-06', 'ppl': '1.764', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 41869312, 'tokens/trainable': 41412584, 'epoch': '7.086'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5111/5680 [12:36:11<1:14:21,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5112/5680 [12:36:19<1:14:15,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5967', 'grad_norm': '0.452', 'learning_rate': '4.911e-06', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 41877504, 'tokens/trainable': 41420744, 'epoch': '7.087'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5112/5680 [12:36:19<1:14:15,  7.84s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5113/5680 [12:36:27<1:15:02,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4249', 'grad_norm': '0.4501', 'learning_rate': '4.894e-06', 'ppl': '1.529', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '996', 'tokens/total': 41885696, 'tokens/trainable': 41428876, 'epoch': '7.087'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5113/5680 [12:36:27<1:15:02,  7.94s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5114/5680 [12:36:35<1:14:36,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4174', 'grad_norm': '0.4531', 'learning_rate': '4.877e-06', 'ppl': '1.518', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 41893888, 'tokens/trainable': 41437040, 'epoch': '7.087'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 5114/5680 [12:36:35<1:14:36,  7.91s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5115/5680 [12:36:43<1:14:18,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6422', 'grad_norm': '0.4955', 'learning_rate': '4.86e-06', 'ppl': '1.901', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 41902080, 'tokens/trainable': 41445228, 'epoch': '7.087'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5115/5680 [12:36:43<1:14:18,  7.89s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5116/5680 [12:36:50<1:14:03,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3084', 'grad_norm': '0.4062', 'learning_rate': '4.843e-06', 'ppl': '1.361', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 41910272, 'tokens/trainable': 41453400, 'epoch': '7.087'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5116/5680 [12:36:50<1:14:03,  7.88s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5117/5680 [12:36:58<1:13:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3367', 'grad_norm': '0.3571', 'learning_rate': '4.826e-06', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 41918464, 'tokens/trainable': 41461512, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5117/5680 [12:36:58<1:13:47,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5118/5680 [12:37:06<1:13:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5793', 'grad_norm': '0.4449', 'learning_rate': '4.809e-06', 'ppl': '1.785', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 41926656, 'tokens/trainable': 41469624, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 5118/5680 [12:37:06<1:13:41,  7.87s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5119/5680 [12:37:14<1:13:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5514', 'grad_norm': '0.451', 'learning_rate': '4.792e-06', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 41934848, 'tokens/trainable': 41477700, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5119/5680 [12:37:14<1:13:28,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5120/5680 [12:37:22<1:13:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3236', 'grad_norm': '0.4085', 'learning_rate': '4.775e-06', 'ppl': '1.382', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 41943040, 'tokens/trainable': 41485652, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5120/5680 [12:37:22<1:13:22,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5121/5680 [12:37:30<1:13:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2704', 'grad_norm': '0.3947', 'learning_rate': '4.759e-06', 'ppl': '1.311', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 41951232, 'tokens/trainable': 41493808, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5121/5680 [12:37:30<1:13:14,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5122/5680 [12:37:38<1:13:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6072', 'grad_norm': '0.4211', 'learning_rate': '4.742e-06', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 41959424, 'tokens/trainable': 41501984, 'epoch': '7.088'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 5122/5680 [12:37:38<1:13:04,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5123/5680 [12:37:45<1:12:54,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5334', 'grad_norm': '0.4143', 'learning_rate': '4.725e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 41967616, 'tokens/trainable': 41510096, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5123/5680 [12:37:45<1:12:54,  7.85s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5124/5680 [12:37:53<1:12:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4982', 'grad_norm': '0.4613', 'learning_rate': '4.708e-06', 'ppl': '1.646', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 41975808, 'tokens/trainable': 41518224, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5124/5680 [12:37:53<1:12:49,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5125/5680 [12:38:01<1:12:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6166', 'grad_norm': '0.5236', 'learning_rate': '4.691e-06', 'ppl': '1.853', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 41984000, 'tokens/trainable': 41526192, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5125/5680 [12:38:01<1:12:41,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5126/5680 [12:38:09<1:12:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3791', 'grad_norm': '0.4325', 'learning_rate': '4.675e-06', 'ppl': '1.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1017', 'tokens/total': 41992192, 'tokens/trainable': 41534168, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5126/5680 [12:38:09<1:12:30,  7.85s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5127/5680 [12:38:17<1:12:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5956', 'grad_norm': '0.4604', 'learning_rate': '4.658e-06', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 42000384, 'tokens/trainable': 41542336, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5127/5680 [12:38:17<1:12:22,  7.85s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5128/5680 [12:38:25<1:12:17,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4087', 'grad_norm': '0.4593', 'learning_rate': '4.641e-06', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 42008576, 'tokens/trainable': 41550312, 'epoch': '7.089'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5128/5680 [12:38:25<1:12:17,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5129/5680 [12:38:32<1:12:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4919', 'grad_norm': '0.4374', 'learning_rate': '4.625e-06', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 42016768, 'tokens/trainable': 41558432, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 5129/5680 [12:38:32<1:12:03,  7.85s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5130/5680 [12:38:40<1:11:51,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3886', 'grad_norm': '0.4531', 'learning_rate': '4.608e-06', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.7', 'tokens/total': 42024960, 'tokens/trainable': 41566192, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5130/5680 [12:38:40<1:11:51,  7.84s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5131/5680 [12:38:48<1:11:43,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5227', 'grad_norm': '0.4468', 'learning_rate': '4.591e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 42033152, 'tokens/trainable': 41574272, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5131/5680 [12:38:48<1:11:43,  7.84s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5132/5680 [12:38:56<1:11:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5041', 'grad_norm': '0.4125', 'learning_rate': '4.575e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 42041344, 'tokens/trainable': 41582220, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5132/5680 [12:38:56<1:11:40,  7.85s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5133/5680 [12:39:04<1:11:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.311', 'grad_norm': '0.397', 'learning_rate': '4.558e-06', 'ppl': '1.365', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 42049536, 'tokens/trainable': 41590216, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 5133/5680 [12:39:04<1:11:38,  7.86s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5134/5680 [12:39:12<1:12:17,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4553', 'grad_norm': '0.4583', 'learning_rate': '4.542e-06', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '978.1', 'tokens/total': 42057728, 'tokens/trainable': 41598176, 'epoch': '7.09'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5134/5680 [12:39:12<1:12:17,  7.94s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5135/5680 [12:39:20<1:11:57,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.3699', 'grad_norm': '0.5251', 'learning_rate': '4.525e-06', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 42065920, 'tokens/trainable': 41606104, 'epoch': '7.091'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5135/5680 [12:39:20<1:11:57,  7.92s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5136/5680 [12:39:28<1:11:40,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3003', 'grad_norm': '0.4749', 'learning_rate': '4.509e-06', 'ppl': '1.35', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 42074112, 'tokens/trainable': 41614056, 'epoch': '7.091'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5136/5680 [12:39:28<1:11:40,  7.91s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5137/5680 [12:39:36<1:11:29,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4928', 'grad_norm': '0.4873', 'learning_rate': '4.493e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.9', 'tokens/total': 42082304, 'tokens/trainable': 41621864, 'epoch': '7.091'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5137/5680 [12:39:36<1:11:29,  7.90s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5138/5680 [12:39:44<1:11:17,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.246', 'grad_norm': '0.3945', 'learning_rate': '4.476e-06', 'ppl': '1.279', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 42090496, 'tokens/trainable': 41629744, 'epoch': '7.091'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5138/5680 [12:39:44<1:11:17,  7.89s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5139/5680 [12:39:51<1:11:08,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5697', 'grad_norm': '0.438', 'learning_rate': '4.46e-06', 'ppl': '1.768', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 42098688, 'tokens/trainable': 41637856, 'epoch': '7.091'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5139/5680 [12:39:51<1:11:08,  7.89s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5140/5680 [12:39:59<1:10:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5108', 'grad_norm': '0.4579', 'learning_rate': '4.443e-06', 'ppl': '1.667', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 42106880, 'tokens/trainable': 41645880, 'epoch': '7.092'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 5140/5680 [12:39:59<1:10:56,  7.88s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5141/5680 [12:40:07<1:10:47,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.1902', 'grad_norm': '0.3623', 'learning_rate': '4.427e-06', 'ppl': '1.21', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.7', 'tokens/total': 42115072, 'tokens/trainable': 41653752, 'epoch': '7.092'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5141/5680 [12:40:07<1:10:47,  7.88s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5142/5680 [12:40:15<1:10:44,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2656', 'grad_norm': '0.3633', 'learning_rate': '4.411e-06', 'ppl': '1.304', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 42123264, 'tokens/trainable': 41661748, 'epoch': '7.092'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5142/5680 [12:40:15<1:10:44,  7.89s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5143/5680 [12:40:23<1:10:35,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3581', 'grad_norm': '0.375', 'learning_rate': '4.395e-06', 'ppl': '1.431', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42131456, 'tokens/trainable': 41669844, 'epoch': '7.092'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5143/5680 [12:40:23<1:10:35,  7.89s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5144/5680 [12:40:31<1:10:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4996', 'grad_norm': '0.4649', 'learning_rate': '4.379e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 42139648, 'tokens/trainable': 41677868, 'epoch': '7.092'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 5144/5680 [12:40:31<1:10:21,  7.88s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5145/5680 [12:40:39<1:10:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5746', 'grad_norm': '0.4088', 'learning_rate': '4.362e-06', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 42147840, 'tokens/trainable': 41685824, 'epoch': '7.092'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5145/5680 [12:40:39<1:10:11,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5146/5680 [12:40:46<1:09:56,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5148', 'grad_norm': '0.4395', 'learning_rate': '4.346e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42156032, 'tokens/trainable': 41693868, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5146/5680 [12:40:47<1:09:56,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5147/5680 [12:40:54<1:09:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.498', 'grad_norm': '0.4657', 'learning_rate': '4.33e-06', 'ppl': '1.645', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 42164224, 'tokens/trainable': 41702028, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5147/5680 [12:40:54<1:09:51,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5148/5680 [12:41:02<1:09:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4406', 'grad_norm': '0.3885', 'learning_rate': '4.314e-06', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 42172416, 'tokens/trainable': 41709988, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 5148/5680 [12:41:02<1:09:47,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5149/5680 [12:41:10<1:09:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7434', 'grad_norm': '0.511', 'learning_rate': '4.298e-06', 'ppl': '2.103', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 42180608, 'tokens/trainable': 41718028, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5149/5680 [12:41:10<1:09:39,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5150/5680 [12:41:18<1:09:29,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3495', 'grad_norm': '0.3976', 'learning_rate': '4.282e-06', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 42188800, 'tokens/trainable': 41725996, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5150/5680 [12:41:18<1:09:29,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5151/5680 [12:41:26<1:09:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4094', 'grad_norm': '0.4472', 'learning_rate': '4.266e-06', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 42196992, 'tokens/trainable': 41734176, 'epoch': '7.093'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5151/5680 [12:41:26<1:09:18,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5152/5680 [12:41:34<1:09:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3672', 'grad_norm': '0.4263', 'learning_rate': '4.25e-06', 'ppl': '1.444', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 42205184, 'tokens/trainable': 41742216, 'epoch': '7.094'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 5152/5680 [12:41:34<1:09:08,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5153/5680 [12:41:42<1:09:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4408', 'grad_norm': '0.4502', 'learning_rate': '4.234e-06', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 42213376, 'tokens/trainable': 41750256, 'epoch': '7.094'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5153/5680 [12:41:42<1:09:05,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5154/5680 [12:41:49<1:08:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4405', 'grad_norm': '0.3999', 'learning_rate': '4.218e-06', 'ppl': '1.554', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 42221568, 'tokens/trainable': 41758376, 'epoch': '7.094'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5154/5680 [12:41:49<1:08:53,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5155/5680 [12:41:57<1:08:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6413', 'grad_norm': '0.4414', 'learning_rate': '4.202e-06', 'ppl': '1.899', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 42229760, 'tokens/trainable': 41766516, 'epoch': '7.094'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 5155/5680 [12:41:57<1:08:43,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5156/5680 [12:42:05<1:08:43,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2464', 'grad_norm': '0.4143', 'learning_rate': '4.186e-06', 'ppl': '1.279', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 42237952, 'tokens/trainable': 41774604, 'epoch': '7.094'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5156/5680 [12:42:05<1:08:43,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5157/5680 [12:42:13<1:08:36,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4014', 'grad_norm': '0.4028', 'learning_rate': '4.171e-06', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1006', 'tokens/total': 42246144, 'tokens/trainable': 41782524, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5157/5680 [12:42:13<1:08:36,  7.87s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5158/5680 [12:42:21<1:08:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5518', 'grad_norm': '0.5604', 'learning_rate': '4.155e-06', 'ppl': '1.736', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 42254336, 'tokens/trainable': 41790496, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5158/5680 [12:42:21<1:08:18,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5159/5680 [12:42:29<1:08:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3686', 'grad_norm': '0.3766', 'learning_rate': '4.139e-06', 'ppl': '1.446', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 42262528, 'tokens/trainable': 41798560, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 5159/5680 [12:42:29<1:08:07,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5160/5680 [12:42:37<1:08:02,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3061', 'grad_norm': '0.4673', 'learning_rate': '4.123e-06', 'ppl': '1.358', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 42270720, 'tokens/trainable': 41806680, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5160/5680 [12:42:37<1:08:02,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5161/5680 [12:42:44<1:07:54,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5712', 'grad_norm': '0.4509', 'learning_rate': '4.108e-06', 'ppl': '1.77', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.1', 'tokens/total': 42278912, 'tokens/trainable': 41814428, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5161/5680 [12:42:44<1:07:54,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5162/5680 [12:42:52<1:07:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.323', 'grad_norm': '0.4833', 'learning_rate': '4.092e-06', 'ppl': '1.381', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 42287104, 'tokens/trainable': 41822432, 'epoch': '7.095'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5162/5680 [12:42:52<1:07:45,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5163/5680 [12:43:00<1:07:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5746', 'grad_norm': '0.4378', 'learning_rate': '4.076e-06', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '995.7', 'tokens/total': 42295296, 'tokens/trainable': 41830288, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 5163/5680 [12:43:00<1:07:44,  7.86s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5164/5680 [12:43:08<1:07:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4178', 'grad_norm': '0.4323', 'learning_rate': '4.061e-06', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1003', 'tokens/total': 42303488, 'tokens/trainable': 41838120, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5164/5680 [12:43:08<1:07:29,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5165/5680 [12:43:16<1:07:17,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5611', 'grad_norm': '0.4671', 'learning_rate': '4.045e-06', 'ppl': '1.753', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 42311680, 'tokens/trainable': 41846148, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5165/5680 [12:43:16<1:07:17,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5166/5680 [12:43:24<1:07:07,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7915', 'grad_norm': '0.4834', 'learning_rate': '4.029e-06', 'ppl': '2.207', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 42319872, 'tokens/trainable': 41854068, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5166/5680 [12:43:24<1:07:07,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5167/5680 [12:43:31<1:07:02,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5968', 'grad_norm': '0.4278', 'learning_rate': '4.014e-06', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42328064, 'tokens/trainable': 41862136, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 5167/5680 [12:43:31<1:07:02,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5168/5680 [12:43:39<1:06:50,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.5102', 'grad_norm': '0.5184', 'learning_rate': '3.998e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42336256, 'tokens/trainable': 41870176, 'epoch': '7.096'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5168/5680 [12:43:39<1:06:50,  7.83s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5169/5680 [12:43:47<1:06:47,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3535', 'grad_norm': '0.3821', 'learning_rate': '3.983e-06', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42344448, 'tokens/trainable': 41878256, 'epoch': '7.097'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5169/5680 [12:43:47<1:06:47,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5170/5680 [12:43:55<1:06:39,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.45', 'grad_norm': '0.4282', 'learning_rate': '3.968e-06', 'ppl': '1.568', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 42352640, 'tokens/trainable': 41886336, 'epoch': '7.097'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 5170/5680 [12:43:55<1:06:39,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5171/5680 [12:44:03<1:06:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5395', 'grad_norm': '0.4354', 'learning_rate': '3.952e-06', 'ppl': '1.715', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 42360832, 'tokens/trainable': 41894440, 'epoch': '7.097'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5171/5680 [12:44:03<1:06:35,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5172/5680 [12:44:11<1:06:23,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5617', 'grad_norm': '0.4692', 'learning_rate': '3.937e-06', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 42369024, 'tokens/trainable': 41902352, 'epoch': '7.097'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5172/5680 [12:44:11<1:06:23,  7.84s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5173/5680 [12:44:19<1:06:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7648', 'grad_norm': '0.4686', 'learning_rate': '3.921e-06', 'ppl': '2.148', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 42377216, 'tokens/trainable': 41910316, 'epoch': '7.097'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5173/5680 [12:44:19<1:06:17,  7.85s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5174/5680 [12:44:26<1:06:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6207', 'grad_norm': '0.4738', 'learning_rate': '3.906e-06', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 42385408, 'tokens/trainable': 41918232, 'epoch': '7.098'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5174/5680 [12:44:26<1:06:18,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5175/5680 [12:44:34<1:06:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5966', 'grad_norm': '0.4725', 'learning_rate': '3.891e-06', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 42393600, 'tokens/trainable': 41926072, 'epoch': '7.098'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5175/5680 [12:44:34<1:06:07,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5176/5680 [12:44:42<1:06:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.1897', 'grad_norm': '0.3828', 'learning_rate': '3.876e-06', 'ppl': '1.209', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 42401792, 'tokens/trainable': 41934056, 'epoch': '7.098'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5176/5680 [12:44:42<1:06:01,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5177/5680 [12:44:50<1:06:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.422', 'grad_norm': '0.4548', 'learning_rate': '3.86e-06', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 42409984, 'tokens/trainable': 41942028, 'epoch': '7.098'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5177/5680 [12:44:50<1:06:00,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5178/5680 [12:44:58<1:05:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4737', 'grad_norm': '0.4664', 'learning_rate': '3.845e-06', 'ppl': '1.606', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 42418176, 'tokens/trainable': 41949972, 'epoch': '7.098'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 5178/5680 [12:44:58<1:05:46,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5179/5680 [12:45:06<1:05:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5643', 'grad_norm': '0.4376', 'learning_rate': '3.83e-06', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 42426368, 'tokens/trainable': 41958112, 'epoch': '7.098'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5179/5680 [12:45:06<1:05:50,  7.89s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5180/5680 [12:45:14<1:05:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4011', 'grad_norm': '0.453', 'learning_rate': '3.815e-06', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 42434560, 'tokens/trainable': 41966160, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5180/5680 [12:45:14<1:05:37,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5181/5680 [12:45:21<1:05:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4658', 'grad_norm': '0.3803', 'learning_rate': '3.8e-06', 'ppl': '1.593', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1020', 'tokens/total': 42442752, 'tokens/trainable': 41974152, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 5181/5680 [12:45:21<1:05:23,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5182/5680 [12:45:29<1:05:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6205', 'grad_norm': '0.4082', 'learning_rate': '3.785e-06', 'ppl': '1.86', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 42450944, 'tokens/trainable': 41982264, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5182/5680 [12:45:29<1:05:17,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5183/5680 [12:45:37<1:05:10,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3151', 'grad_norm': '0.3583', 'learning_rate': '3.77e-06', 'ppl': '1.37', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 42459136, 'tokens/trainable': 41990240, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5183/5680 [12:45:37<1:05:10,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5184/5680 [12:45:45<1:05:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4894', 'grad_norm': '0.4355', 'learning_rate': '3.754e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 42467328, 'tokens/trainable': 41998152, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5184/5680 [12:45:45<1:05:03,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5185/5680 [12:45:53<1:04:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3618', 'grad_norm': '0.3975', 'learning_rate': '3.739e-06', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 42475520, 'tokens/trainable': 42006212, 'epoch': '7.099'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 5185/5680 [12:45:53<1:04:51,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5186/5680 [12:46:01<1:04:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3523', 'grad_norm': '0.3871', 'learning_rate': '3.725e-06', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 42483712, 'tokens/trainable': 42014152, 'epoch': '7.1'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5186/5680 [12:46:01<1:04:46,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5187/5680 [12:46:09<1:04:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3404', 'grad_norm': '0.3749', 'learning_rate': '3.71e-06', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '985.3', 'tokens/total': 42491904, 'tokens/trainable': 42021876, 'epoch': '7.1'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5187/5680 [12:46:09<1:04:34,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5188/5680 [12:46:17<1:04:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5403', 'grad_norm': '0.4279', 'learning_rate': '3.695e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 42500096, 'tokens/trainable': 42029808, 'epoch': '7.1'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5188/5680 [12:46:17<1:04:26,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5189/5680 [12:46:24<1:04:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4353', 'grad_norm': '0.4436', 'learning_rate': '3.68e-06', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1015', 'tokens/total': 42508288, 'tokens/trainable': 42037800, 'epoch': '7.1'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 5189/5680 [12:46:24<1:04:21,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5190/5680 [12:46:32<1:04:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7014', 'grad_norm': '0.5112', 'learning_rate': '3.665e-06', 'ppl': '2.017', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 42516480, 'tokens/trainable': 42045964, 'epoch': '7.1'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5190/5680 [12:46:32<1:04:10,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5191/5680 [12:46:40<1:04:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4441', 'grad_norm': '0.4205', 'learning_rate': '3.65e-06', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 42524672, 'tokens/trainable': 42053848, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5191/5680 [12:46:40<1:04:06,  7.87s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5192/5680 [12:46:48<1:03:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5248', 'grad_norm': '0.4238', 'learning_rate': '3.635e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 42532864, 'tokens/trainable': 42061784, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5192/5680 [12:46:48<1:03:51,  7.85s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5193/5680 [12:46:56<1:03:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5971', 'grad_norm': '0.4192', 'learning_rate': '3.621e-06', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 42541056, 'tokens/trainable': 42069936, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 5193/5680 [12:46:56<1:03:42,  7.85s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5194/5680 [12:47:04<1:03:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3664', 'grad_norm': '0.4252', 'learning_rate': '3.606e-06', 'ppl': '1.443', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '993.7', 'tokens/total': 42549248, 'tokens/trainable': 42077768, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5194/5680 [12:47:04<1:03:39,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5195/5680 [12:47:12<1:03:33,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5442', 'grad_norm': '0.4555', 'learning_rate': '3.591e-06', 'ppl': '1.723', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 42557440, 'tokens/trainable': 42085784, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5195/5680 [12:47:12<1:03:33,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5196/5680 [12:47:19<1:03:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3825', 'grad_norm': '0.4166', 'learning_rate': '3.576e-06', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 42565632, 'tokens/trainable': 42093808, 'epoch': '7.101'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 5196/5680 [12:47:19<1:03:24,  7.86s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5197/5680 [12:47:27<1:03:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3179', 'grad_norm': '0.3558', 'learning_rate': '3.562e-06', 'ppl': '1.374', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1016', 'tokens/total': 42573824, 'tokens/trainable': 42101788, 'epoch': '7.102'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5197/5680 [12:47:27<1:03:16,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5198/5680 [12:47:35<1:03:56,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.2691', 'grad_norm': '0.3769', 'learning_rate': '3.547e-06', 'ppl': '1.309', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '949.7', 'tokens/total': 42582016, 'tokens/trainable': 42109568, 'epoch': '7.102'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5198/5680 [12:47:35<1:03:56,  7.96s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5199/5680 [12:47:43<1:03:30,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5188', 'grad_norm': '0.4103', 'learning_rate': '3.533e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 42590208, 'tokens/trainable': 42117672, 'epoch': '7.102'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5199/5680 [12:47:43<1:03:30,  7.92s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5200/5680 [12:47:51<1:03:12,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3758', 'grad_norm': '0.4315', 'learning_rate': '3.518e-06', 'ppl': '1.456', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 42598400, 'tokens/trainable': 42125832, 'epoch': '7.102'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 5200/5680 [12:47:51<1:03:12,  7.90s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5201/5680 [12:47:59<1:02:59,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2428', 'grad_norm': '0.4096', 'learning_rate': '3.504e-06', 'ppl': '1.275', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.2', 'tokens/total': 42606592, 'tokens/trainable': 42133616, 'epoch': '7.102'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5201/5680 [12:47:59<1:02:59,  7.89s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5202/5680 [12:48:07<1:02:48,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4848', 'grad_norm': '0.3911', 'learning_rate': '3.489e-06', 'ppl': '1.624', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 42614784, 'tokens/trainable': 42141792, 'epoch': '7.102'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5202/5680 [12:48:07<1:02:48,  7.88s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5203/5680 [12:48:15<1:02:43,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2826', 'grad_norm': '0.4118', 'learning_rate': '3.475e-06', 'ppl': '1.327', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1013', 'tokens/total': 42622976, 'tokens/trainable': 42149796, 'epoch': '7.103'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5203/5680 [12:48:15<1:02:43,  7.89s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5204/5680 [12:48:23<1:02:36,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3754', 'grad_norm': '0.4225', 'learning_rate': '3.46e-06', 'ppl': '1.456', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 42631168, 'tokens/trainable': 42157956, 'epoch': '7.103'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 5204/5680 [12:48:23<1:02:36,  7.89s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5205/5680 [12:48:31<1:02:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.29', 'grad_norm': '0.4357', 'learning_rate': '3.446e-06', 'ppl': '1.336', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 42639360, 'tokens/trainable': 42165976, 'epoch': '7.103'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5205/5680 [12:48:31<1:02:23,  7.88s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5206/5680 [12:48:38<1:02:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4456', 'grad_norm': '0.3947', 'learning_rate': '3.431e-06', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '988.1', 'tokens/total': 42647552, 'tokens/trainable': 42173720, 'epoch': '7.103'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5206/5680 [12:48:38<1:02:09,  7.87s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5207/5680 [12:48:46<1:02:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3849', 'grad_norm': '0.3799', 'learning_rate': '3.417e-06', 'ppl': '1.469', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 42655744, 'tokens/trainable': 42181856, 'epoch': '7.103'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 5207/5680 [12:48:46<1:02:02,  7.87s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5208/5680 [12:48:54<1:01:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3434', 'grad_norm': '0.3605', 'learning_rate': '3.403e-06', 'ppl': '1.41', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 42663936, 'tokens/trainable': 42189884, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5208/5680 [12:48:54<1:01:52,  7.87s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5209/5680 [12:49:02<1:01:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5671', 'grad_norm': '0.4904', 'learning_rate': '3.388e-06', 'ppl': '1.763', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '972.5', 'tokens/total': 42672128, 'tokens/trainable': 42197524, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5209/5680 [12:49:02<1:01:43,  7.86s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5210/5680 [12:49:10<1:01:34,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7989', 'grad_norm': '0.4382', 'learning_rate': '3.374e-06', 'ppl': '2.223', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 42680320, 'tokens/trainable': 42205616, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5210/5680 [12:49:10<1:01:34,  7.86s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5211/5680 [12:49:18<1:01:35,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3205', 'grad_norm': '0.4234', 'learning_rate': '3.36e-06', 'ppl': '1.378', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 42688512, 'tokens/trainable': 42213804, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 5211/5680 [12:49:18<1:01:35,  7.88s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5212/5680 [12:49:26<1:01:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4436', 'grad_norm': '0.4252', 'learning_rate': '3.346e-06', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 42696704, 'tokens/trainable': 42221956, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5212/5680 [12:49:26<1:01:24,  7.87s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5213/5680 [12:49:34<1:01:53,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.2406', 'grad_norm': '0.3374', 'learning_rate': '3.331e-06', 'ppl': '1.272', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '997.3', 'tokens/total': 42704896, 'tokens/trainable': 42230072, 'epoch': '7.104'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5213/5680 [12:49:34<1:01:53,  7.95s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5214/5680 [12:49:42<1:01:35,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3234', 'grad_norm': '0.3969', 'learning_rate': '3.317e-06', 'ppl': '1.382', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.9', 'tokens/total': 42713088, 'tokens/trainable': 42237840, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5214/5680 [12:49:42<1:01:35,  7.93s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5215/5680 [12:49:49<1:01:18,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.3774', 'learning_rate': '3.303e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 42721280, 'tokens/trainable': 42245784, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 5215/5680 [12:49:49<1:01:18,  7.91s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5216/5680 [12:49:57<1:01:02,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4765', 'grad_norm': '0.4921', 'learning_rate': '3.289e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 42729472, 'tokens/trainable': 42253676, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5216/5680 [12:49:57<1:01:02,  7.89s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5217/5680 [12:50:05<1:00:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.2671', 'grad_norm': '0.3885', 'learning_rate': '3.275e-06', 'ppl': '1.306', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 42737664, 'tokens/trainable': 42261568, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5217/5680 [12:50:05<1:00:49,  7.88s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5218/5680 [12:50:13<1:00:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6669', 'grad_norm': '0.4748', 'learning_rate': '3.261e-06', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 42745856, 'tokens/trainable': 42269712, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5218/5680 [12:50:13<1:00:40,  7.88s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5219/5680 [12:50:21<1:00:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5104', 'grad_norm': '0.4191', 'learning_rate': '3.247e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 42754048, 'tokens/trainable': 42277552, 'epoch': '7.105'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5219/5680 [12:50:21<1:00:25,  7.86s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5220/5680 [12:50:29<1:00:13,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3524', 'grad_norm': '0.3679', 'learning_rate': '3.233e-06', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 42762240, 'tokens/trainable': 42285472, 'epoch': '7.106'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5220/5680 [12:50:29<1:00:13,  7.85s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5221/5680 [12:50:37<1:00:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3283', 'grad_norm': '0.372', 'learning_rate': '3.219e-06', 'ppl': '1.389', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '987.6', 'tokens/total': 42770432, 'tokens/trainable': 42293232, 'epoch': '7.106'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5221/5680 [12:50:37<1:00:06,  7.86s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5222/5680 [12:50:44<1:00:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3166', 'grad_norm': '0.3489', 'learning_rate': '3.205e-06', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '986.7', 'tokens/total': 42778624, 'tokens/trainable': 42301008, 'epoch': '7.106'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5222/5680 [12:50:44<1:00:02,  7.86s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5223/5680 [12:50:52<59:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3644', 'grad_norm': '0.4318', 'learning_rate': '3.191e-06', 'ppl': '1.44', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 42786816, 'tokens/trainable': 42309088, 'epoch': '7.106'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 5223/5680 [12:50:52<59:51,  7.86s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5224/5680 [12:51:00<59:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.451', 'grad_norm': '0.4318', 'learning_rate': '3.178e-06', 'ppl': '1.57', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 42795008, 'tokens/trainable': 42317040, 'epoch': '7.106'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5224/5680 [12:51:00<59:36,  7.84s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5225/5680 [12:51:08<59:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3856', 'grad_norm': '0.4227', 'learning_rate': '3.164e-06', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1007', 'tokens/total': 42803200, 'tokens/trainable': 42324952, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5225/5680 [12:51:08<59:30,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5226/5680 [12:51:16<59:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6622', 'grad_norm': '0.403', 'learning_rate': '3.15e-06', 'ppl': '1.939', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 42811392, 'tokens/trainable': 42333032, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5226/5680 [12:51:16<59:25,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5227/5680 [12:51:24<59:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5138', 'grad_norm': '0.4878', 'learning_rate': '3.136e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1024', 'tokens/total': 42819584, 'tokens/trainable': 42341056, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 5227/5680 [12:51:24<59:15,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5228/5680 [12:51:32<59:08,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4048', 'grad_norm': '0.3969', 'learning_rate': '3.122e-06', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.2', 'tokens/total': 42827776, 'tokens/trainable': 42348832, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5228/5680 [12:51:32<59:08,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5229/5680 [12:51:39<58:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4779', 'grad_norm': '0.5136', 'learning_rate': '3.109e-06', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1008', 'tokens/total': 42835968, 'tokens/trainable': 42356728, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5229/5680 [12:51:39<58:58,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5230/5680 [12:51:47<58:49,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.7134', 'grad_norm': '0.6378', 'learning_rate': '3.095e-06', 'ppl': '2.041', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 42844160, 'tokens/trainable': 42364760, 'epoch': '7.107'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5230/5680 [12:51:47<58:49,  7.84s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5231/5680 [12:51:55<58:42,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4487', 'grad_norm': '0.435', 'learning_rate': '3.081e-06', 'ppl': '1.566', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1011', 'tokens/total': 42852352, 'tokens/trainable': 42372696, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 5231/5680 [12:51:55<58:42,  7.84s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5232/5680 [12:52:03<58:33,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5999', 'grad_norm': '0.4623', 'learning_rate': '3.068e-06', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 42860544, 'tokens/trainable': 42380868, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5232/5680 [12:52:03<58:33,  7.84s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5233/5680 [12:52:11<58:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6847', 'grad_norm': '0.5119', 'learning_rate': '3.054e-06', 'ppl': '1.983', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '952.1', 'tokens/total': 42868736, 'tokens/trainable': 42388348, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5233/5680 [12:52:11<58:27,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5234/5680 [12:52:19<58:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7592', 'grad_norm': '0.446', 'learning_rate': '3.041e-06', 'ppl': '2.137', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 42876928, 'tokens/trainable': 42396456, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 5234/5680 [12:52:19<58:22,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5235/5680 [12:52:26<58:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5778', 'grad_norm': '0.5127', 'learning_rate': '3.027e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1018', 'tokens/total': 42885120, 'tokens/trainable': 42404432, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5235/5680 [12:52:26<58:12,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5236/5680 [12:52:34<58:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3296', 'grad_norm': '0.3857', 'learning_rate': '3.014e-06', 'ppl': '1.39', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1009', 'tokens/total': 42893312, 'tokens/trainable': 42412340, 'epoch': '7.108'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5236/5680 [12:52:34<58:03,  7.85s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5237/5680 [12:52:42<57:54,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3535', 'grad_norm': '0.3621', 'learning_rate': '3e-06', 'ppl': '1.424', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 42901504, 'tokens/trainable': 42420484, 'epoch': '7.109'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5237/5680 [12:52:42<57:54,  7.84s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5238/5680 [12:52:50<57:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4744', 'grad_norm': '0.4363', 'learning_rate': '2.987e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1010', 'tokens/total': 42909696, 'tokens/trainable': 42428444, 'epoch': '7.109'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 5238/5680 [12:52:50<57:51,  7.85s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5239/5680 [12:52:58<57:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3636', 'grad_norm': '0.4531', 'learning_rate': '2.973e-06', 'ppl': '1.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 42917888, 'tokens/trainable': 42436336, 'epoch': '7.109'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5239/5680 [12:52:58<57:45,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5240/5680 [12:53:06<57:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.8414', 'grad_norm': '0.4753', 'learning_rate': '2.96e-06', 'ppl': '2.32', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '967.3', 'tokens/total': 42926080, 'tokens/trainable': 42443952, 'epoch': '7.109'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5240/5680 [12:53:06<57:40,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5241/5680 [12:53:14<57:32,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7532', 'grad_norm': '0.5064', 'learning_rate': '2.947e-06', 'ppl': '2.124', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 42934272, 'tokens/trainable': 42452012, 'epoch': '7.109'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5241/5680 [12:53:14<57:32,  7.87s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5242/5680 [12:53:21<57:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2567', 'grad_norm': '0.3467', 'learning_rate': '2.933e-06', 'ppl': '1.293', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 42942464, 'tokens/trainable': 42459888, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 5242/5680 [12:53:21<57:25,  7.87s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5243/5680 [12:53:29<57:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3442', 'grad_norm': '0.372', 'learning_rate': '2.92e-06', 'ppl': '1.411', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1005', 'tokens/total': 42950656, 'tokens/trainable': 42467768, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5243/5680 [12:53:29<57:14,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5244/5680 [12:53:37<57:03,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3385', 'grad_norm': '0.4102', 'learning_rate': '2.907e-06', 'ppl': '1.403', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 42958848, 'tokens/trainable': 42475896, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5244/5680 [12:53:37<57:03,  7.85s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5245/5680 [12:53:45<56:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2801', 'grad_norm': '0.4551', 'learning_rate': '2.894e-06', 'ppl': '1.323', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 42967040, 'tokens/trainable': 42483976, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 5245/5680 [12:53:45<56:58,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5246/5680 [12:53:53<56:51,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3564', 'grad_norm': '0.3849', 'learning_rate': '2.88e-06', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 42975232, 'tokens/trainable': 42492132, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5246/5680 [12:53:53<56:51,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5247/5680 [12:54:01<56:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3705', 'grad_norm': '0.427', 'learning_rate': '2.867e-06', 'ppl': '1.448', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991.5', 'tokens/total': 42983424, 'tokens/trainable': 42499952, 'epoch': '7.11'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5247/5680 [12:54:01<56:46,  7.87s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5248/5680 [12:54:09<56:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4355', 'grad_norm': '0.4425', 'learning_rate': '2.854e-06', 'ppl': '1.546', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1004', 'tokens/total': 42991616, 'tokens/trainable': 42507836, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5248/5680 [12:54:09<56:37,  7.87s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5249/5680 [12:54:17<56:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.623', 'grad_norm': '0.4999', 'learning_rate': '2.841e-06', 'ppl': '1.865', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 42999808, 'tokens/trainable': 42515980, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 5249/5680 [12:54:17<56:28,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5250/5680 [12:54:24<56:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5584', 'grad_norm': '0.5202', 'learning_rate': '2.828e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 43008000, 'tokens/trainable': 42524016, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5250/5680 [12:54:24<56:22,  7.87s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5251/5680 [12:54:32<56:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4765', 'grad_norm': '0.3904', 'learning_rate': '2.815e-06', 'ppl': '1.61', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43016192, 'tokens/trainable': 42532192, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5251/5680 [12:54:32<56:13,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5252/5680 [12:54:40<56:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5285', 'grad_norm': '0.4949', 'learning_rate': '2.802e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1012', 'tokens/total': 43024384, 'tokens/trainable': 42540136, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5252/5680 [12:54:40<56:04,  7.86s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5253/5680 [12:54:48<55:47,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4068', 'grad_norm': '0.3925', 'learning_rate': '2.789e-06', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1019', 'tokens/total': 43032576, 'tokens/trainable': 42548072, 'epoch': '7.111'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 5253/5680 [12:54:48<55:47,  7.84s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5254/5680 [12:54:56<55:39,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5142', 'grad_norm': '0.449', 'learning_rate': '2.776e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43040768, 'tokens/trainable': 42556216, 'epoch': '7.112'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5254/5680 [12:54:56<55:39,  7.84s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5255/5680 [12:55:04<55:30,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3615', 'grad_norm': '0.4217', 'learning_rate': '2.763e-06', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 43048960, 'tokens/trainable': 42564372, 'epoch': '7.112'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5255/5680 [12:55:04<55:30,  7.84s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5256/5680 [12:55:11<55:25,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3073', 'grad_norm': '0.3394', 'learning_rate': '2.75e-06', 'ppl': '1.36', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1021', 'tokens/total': 43057152, 'tokens/trainable': 42572392, 'epoch': '7.112'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 5256/5680 [12:55:11<55:25,  7.84s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5257/5680 [12:55:19<55:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2367', 'grad_norm': '0.5035', 'learning_rate': '2.737e-06', 'ppl': '1.267', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.5', 'tokens/total': 43065344, 'tokens/trainable': 42580204, 'epoch': '7.112'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5257/5680 [12:55:19<55:23,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5258/5680 [12:55:27<55:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3962', 'grad_norm': '0.5039', 'learning_rate': '2.724e-06', 'ppl': '1.486', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '974.7', 'tokens/total': 43073536, 'tokens/trainable': 42587880, 'epoch': '7.112'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5258/5680 [12:55:27<55:18,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5259/5680 [12:55:35<55:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3908', 'grad_norm': '0.4088', 'learning_rate': '2.712e-06', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1000', 'tokens/total': 43081728, 'tokens/trainable': 42595728, 'epoch': '7.112'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5259/5680 [12:55:35<55:08,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5260/5680 [12:55:43<55:04,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3317', 'grad_norm': '0.4497', 'learning_rate': '2.699e-06', 'ppl': '1.393', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43089920, 'tokens/trainable': 42603888, 'epoch': '7.113'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 5260/5680 [12:55:43<55:04,  7.87s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5261/5680 [12:55:51<54:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3133', 'grad_norm': '0.4304', 'learning_rate': '2.686e-06', 'ppl': '1.368', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '947', 'tokens/total': 43098112, 'tokens/trainable': 42611364, 'epoch': '7.113'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5261/5680 [12:55:51<54:59,  7.88s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5262/5680 [12:55:59<54:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3163', 'grad_norm': '0.3786', 'learning_rate': '2.673e-06', 'ppl': '1.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '989.4', 'tokens/total': 43106304, 'tokens/trainable': 42619128, 'epoch': '7.113'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5262/5680 [12:55:59<54:48,  7.87s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5263/5680 [12:56:07<54:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5558', 'grad_norm': '0.5264', 'learning_rate': '2.661e-06', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43114496, 'tokens/trainable': 42627280, 'epoch': '7.113'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5263/5680 [12:56:07<54:40,  7.87s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5264/5680 [12:56:14<54:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3795', 'grad_norm': '0.3867', 'learning_rate': '2.648e-06', 'ppl': '1.462', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1014', 'tokens/total': 43122688, 'tokens/trainable': 42635216, 'epoch': '7.113'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5264/5680 [12:56:14<54:28,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5265/5680 [12:56:22<54:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5454', 'grad_norm': '0.4634', 'learning_rate': '2.635e-06', 'ppl': '1.725', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '991', 'tokens/total': 43130880, 'tokens/trainable': 42643020, 'epoch': '7.114'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5265/5680 [12:56:22<54:22,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5266/5680 [12:56:30<54:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5253', 'grad_norm': '0.5011', 'learning_rate': '2.623e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 43139072, 'tokens/trainable': 42650884, 'epoch': '7.114'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5266/5680 [12:56:30<54:14,  7.86s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5267/5680 [12:56:38<54:06,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4672', 'grad_norm': '0.4648', 'learning_rate': '2.61e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '979.9', 'tokens/total': 43147264, 'tokens/trainable': 42658588, 'epoch': '7.114'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 5267/5680 [12:56:38<54:06,  7.86s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5268/5680 [12:56:46<54:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5874', 'grad_norm': '0.5196', 'learning_rate': '2.598e-06', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '948.7', 'tokens/total': 43155456, 'tokens/trainable': 42666056, 'epoch': '7.114'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5268/5680 [12:56:46<54:00,  7.86s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5269/5680 [12:56:54<54:26,  7.95s/it]                                                                                                                                                                                                                                             {'loss': '0.489', 'grad_norm': '0.4685', 'learning_rate': '2.585e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '982.4', 'tokens/total': 43163648, 'tokens/trainable': 42674052, 'epoch': '7.114'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5269/5680 [12:56:54<54:26,  7.95s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5270/5680 [12:57:02<54:07,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.2946', 'grad_norm': '0.4347', 'learning_rate': '2.573e-06', 'ppl': '1.343', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '983.2', 'tokens/total': 43171840, 'tokens/trainable': 42681776, 'epoch': '7.114'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5270/5680 [12:57:02<54:07,  7.92s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5271/5680 [12:57:10<53:53,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3324', 'grad_norm': '0.4714', 'learning_rate': '2.56e-06', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.3', 'tokens/total': 43180032, 'tokens/trainable': 42689464, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 5271/5680 [12:57:10<53:53,  7.90s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5272/5680 [12:57:18<53:39,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3327', 'grad_norm': '0.3264', 'learning_rate': '2.548e-06', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43188224, 'tokens/trainable': 42697576, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5272/5680 [12:57:18<53:39,  7.89s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5273/5680 [12:57:25<53:30,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3497', 'grad_norm': '0.3409', 'learning_rate': '2.535e-06', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 43196416, 'tokens/trainable': 42705688, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5273/5680 [12:57:25<53:30,  7.89s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5274/5680 [12:57:33<53:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3391', 'grad_norm': '0.3909', 'learning_rate': '2.523e-06', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '954', 'tokens/total': 43204608, 'tokens/trainable': 42713156, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5274/5680 [12:57:33<53:15,  7.87s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5275/5680 [12:57:41<53:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.207', 'grad_norm': '0.4154', 'learning_rate': '2.511e-06', 'ppl': '1.23', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '970.1', 'tokens/total': 43212800, 'tokens/trainable': 42720792, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 5275/5680 [12:57:41<53:08,  7.87s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5276/5680 [12:57:49<52:57,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6619', 'grad_norm': '0.4456', 'learning_rate': '2.498e-06', 'ppl': '1.938', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '957', 'tokens/total': 43220992, 'tokens/trainable': 42728296, 'epoch': '7.115'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5276/5680 [12:57:49<52:57,  7.86s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5277/5680 [12:57:57<52:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3988', 'grad_norm': '0.3838', 'learning_rate': '2.486e-06', 'ppl': '1.49', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '977.8', 'tokens/total': 43229184, 'tokens/trainable': 42735984, 'epoch': '7.116'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5277/5680 [12:57:57<52:49,  7.86s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5278/5680 [12:58:05<52:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5702', 'grad_norm': '0.4321', 'learning_rate': '2.474e-06', 'ppl': '1.769', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.5', 'tokens/total': 43237376, 'tokens/trainable': 42743808, 'epoch': '7.116'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5278/5680 [12:58:05<52:44,  7.87s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5279/5680 [12:58:13<52:48,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.4582', 'grad_norm': '0.49', 'learning_rate': '2.462e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '999.4', 'tokens/total': 43245568, 'tokens/trainable': 42751772, 'epoch': '7.116'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 5279/5680 [12:58:13<52:48,  7.90s/it][2026-01-27 10:47:26,855] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:62458] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 10:47:28,127] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:62458] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None
Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:03<03:24, 27.22 examples/s]Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:04<01:32, 59.04 examples/s]Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:04<00:56, 94.09 examples/s]Tokenizing Prompts (num_proc=54):   7%|███████████▌                                                                                                                                               | 424/5677 [00:04<00:37, 140.39 examples/s]Tokenizing Prompts (num_proc=54):   9%|██████████████▍                                                                                                                                            | 530/5677 [00:05<00:30, 166.71 examples/s]Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:05<00:25, 200.24 examples/s]Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:05<00:21, 234.82 examples/s]Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:06<00:18, 259.49 examples/s]Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:06<00:17, 273.98 examples/s]Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:06<00:16, 286.15 examples/s]Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:06<00:13, 334.96 examples/s]Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:07<00:14, 298.78 examples/s]Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:07<00:13, 307.84 examples/s]Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:08<00:13, 313.66 examples/s]Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:08<00:12, 318.93 examples/s]Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:08<00:12, 315.01 examples/s]Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:08<00:10, 354.58 examples/s]Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:09<00:12, 314.82 examples/s]Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:09<00:11, 310.34 examples/s]Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:09<00:11, 324.26 examples/s]Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:10<00:10, 324.75 examples/s]Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:10<00:10, 328.97 examples/s]Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:10<00:09, 327.85 examples/s]Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:11<00:09, 332.98 examples/s]Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:11<00:09, 325.13 examples/s]Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:11<00:09, 313.97 examples/s]Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:12<00:08, 346.63 examples/s]Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:12<00:07, 341.44 examples/s]Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:12<00:08, 313.45 examples/s]Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:13<00:07, 344.46 examples/s]Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:13<00:07, 343.10 examples/s]Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:13<00:06, 346.61 examples/s]Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:14<00:06, 340.37 examples/s]Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:14<00:06, 338.46 examples/s]Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:14<00:05, 342.52 examples/s]Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:14<00:05, 339.24 examples/s]Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:15<00:05, 340.78 examples/s]Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:15<00:04, 381.17 examples/s]Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:15<00:04, 327.07 examples/s]Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:16<00:04, 326.45 examples/s]Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:16<00:04, 324.92 examples/s]Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:16<00:03, 330.99 examples/s]Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:17<00:03, 334.69 examples/s]Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:17<00:03, 336.01 examples/s]Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:17<00:02, 336.12 examples/s]Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:18<00:02, 338.87 examples/s]Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:18<00:02, 334.82 examples/s]Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:18<00:01, 332.62 examples/s]Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:19<00:01, 332.76 examples/s]Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:19<00:01, 331.50 examples/s]Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:19<00:00, 376.71 examples/s]Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:19<00:00, 362.98 examples/s]Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:20<00:00, 332.34 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:20<00:00, 401.90 examples/s]Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:20<00:00, 271.53 examples/s]
Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s]Dropping Long Sequences:  18%|████████████████████████████▌                                                                                                                                     | 1000/5677 [00:00<00:04, 1010.48 examples/s]Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1379.88 examples/s]Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:02<00:01, 1506.62 examples/s]Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:01, 1655.52 examples/s]Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1722.56 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1795.69 examples/s]Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1611.30 examples/s]
Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s]Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1452.97 examples/s]Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:01<00:01, 2105.07 examples/s]Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2499.31 examples/s]Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2764.52 examples/s]Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2854.73 examples/s]Add position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:01<00:00, 2516.94 examples/s]
[2026-01-27 10:48:00,582] [WARNING] [py.warnings._showwarnmsg:109] [PID:62458] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5280/5680 [12:58:54<2:00:23, 18.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4873', 'grad_norm': '0.4586', 'learning_rate': '2.45e-06', 'ppl': '1.628', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '194.8', 'tokens/total': 43253760, 'tokens/trainable': 42759844, 'epoch': '7.116'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5280/5680 [12:58:55<2:00:23, 18.06s/it][2026-01-27 10:48:08,771] [WARNING] [datasets.iterable_dataset._iter_pytorch:2405] [PID:62697] Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.
[2026-01-27 10:48:10,026] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:62697] Loading dataset: Guilherme34/best-dataset-glm47flash with base_type: pretrain and prompt_style: None

Tokenizing Prompts (num_proc=54):   0%|                                                                                                                                                                      | 0/5677 [00:00<?, ? examples/s][A
Tokenizing Prompts (num_proc=54):   2%|██▉                                                                                                                                                         | 106/5677 [00:06<05:57, 15.59 examples/s][A
Tokenizing Prompts (num_proc=54):   4%|█████▊                                                                                                                                                      | 212/5677 [00:07<02:39, 34.21 examples/s][A
Tokenizing Prompts (num_proc=54):   6%|████████▋                                                                                                                                                   | 318/5677 [00:07<01:36, 55.52 examples/s][A
Tokenizing Prompts (num_proc=54):   7%|███████████▋                                                                                                                                                | 424/5677 [00:08<01:09, 75.17 examples/s][A
Tokenizing Prompts (num_proc=54):   9%|██████████████▌                                                                                                                                             | 530/5677 [00:09<00:53, 95.97 examples/s][A
Tokenizing Prompts (num_proc=54):  11%|█████████████████▎                                                                                                                                         | 636/5677 [00:09<00:40, 124.62 examples/s][A
Tokenizing Prompts (num_proc=54):  13%|████████████████████▎                                                                                                                                      | 742/5677 [00:09<00:34, 145.12 examples/s][A
Tokenizing Prompts (num_proc=54):  15%|███████████████████████▏                                                                                                                                   | 847/5677 [00:10<00:30, 159.96 examples/s][A
Tokenizing Prompts (num_proc=54):  17%|█████████████████████████▉                                                                                                                                 | 952/5677 [00:10<00:27, 171.81 examples/s][A
Tokenizing Prompts (num_proc=54):  19%|████████████████████████████▋                                                                                                                             | 1057/5677 [00:11<00:24, 188.65 examples/s][A
Tokenizing Prompts (num_proc=54):  20%|███████████████████████████████▌                                                                                                                          | 1162/5677 [00:11<00:24, 187.90 examples/s][A
Tokenizing Prompts (num_proc=54):  22%|██████████████████████████████████▎                                                                                                                       | 1267/5677 [00:12<00:22, 200.06 examples/s][A
Tokenizing Prompts (num_proc=54):  24%|█████████████████████████████████████▏                                                                                                                    | 1372/5677 [00:12<00:21, 201.77 examples/s][A
Tokenizing Prompts (num_proc=54):  26%|████████████████████████████████████████                                                                                                                  | 1477/5677 [00:13<00:20, 206.54 examples/s][A
Tokenizing Prompts (num_proc=54):  28%|██████████████████████████████████████████▉                                                                                                               | 1582/5677 [00:13<00:19, 212.87 examples/s][A
Tokenizing Prompts (num_proc=54):  30%|█████████████████████████████████████████████▊                                                                                                            | 1687/5677 [00:14<00:18, 216.03 examples/s][A
Tokenizing Prompts (num_proc=54):  32%|████████████████████████████████████████████████▌                                                                                                         | 1792/5677 [00:14<00:18, 213.70 examples/s][A
Tokenizing Prompts (num_proc=54):  33%|███████████████████████████████████████████████████▍                                                                                                      | 1897/5677 [00:15<00:17, 218.51 examples/s][A
Tokenizing Prompts (num_proc=54):  35%|██████████████████████████████████████████████████████▎                                                                                                   | 2002/5677 [00:15<00:17, 214.05 examples/s][A
Tokenizing Prompts (num_proc=54):  37%|█████████████████████████████████████████████████████████▏                                                                                                | 2107/5677 [00:16<00:15, 228.88 examples/s][A
Tokenizing Prompts (num_proc=54):  39%|████████████████████████████████████████████████████████████                                                                                              | 2212/5677 [00:16<00:15, 223.58 examples/s][A
Tokenizing Prompts (num_proc=54):  41%|██████████████████████████████████████████████████████████████▊                                                                                           | 2317/5677 [00:17<00:15, 218.27 examples/s][A
Tokenizing Prompts (num_proc=54):  43%|█████████████████████████████████████████████████████████████████▋                                                                                        | 2422/5677 [00:17<00:14, 228.98 examples/s][A
Tokenizing Prompts (num_proc=54):  45%|████████████████████████████████████████████████████████████████████▌                                                                                     | 2527/5677 [00:17<00:13, 225.95 examples/s][A
Tokenizing Prompts (num_proc=54):  46%|███████████████████████████████████████████████████████████████████████▍                                                                                  | 2632/5677 [00:18<00:13, 226.97 examples/s][A
Tokenizing Prompts (num_proc=54):  48%|██████████████████████████████████████████████████████████████████████████▏                                                                               | 2737/5677 [00:18<00:13, 223.92 examples/s][A
Tokenizing Prompts (num_proc=54):  50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 2842/5677 [00:19<00:12, 220.47 examples/s][A
Tokenizing Prompts (num_proc=54):  52%|███████████████████████████████████████████████████████████████████████████████▉                                                                          | 2947/5677 [00:19<00:12, 222.10 examples/s][A
Tokenizing Prompts (num_proc=54):  54%|██████████████████████████████████████████████████████████████████████████████████▊                                                                       | 3052/5677 [00:20<00:12, 208.74 examples/s][A
Tokenizing Prompts (num_proc=54):  56%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3157/5677 [00:20<00:11, 222.48 examples/s][A
Tokenizing Prompts (num_proc=54):  57%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3262/5677 [00:21<00:10, 221.60 examples/s][A
Tokenizing Prompts (num_proc=54):  59%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 3367/5677 [00:21<00:10, 221.39 examples/s][A
Tokenizing Prompts (num_proc=54):  61%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3472/5677 [00:22<00:09, 229.25 examples/s][A
Tokenizing Prompts (num_proc=54):  63%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3577/5677 [00:22<00:09, 226.13 examples/s][A
Tokenizing Prompts (num_proc=54):  65%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 3682/5677 [00:23<00:08, 223.50 examples/s][A
Tokenizing Prompts (num_proc=54):  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 3787/5677 [00:23<00:08, 226.05 examples/s][A
Tokenizing Prompts (num_proc=54):  69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 3892/5677 [00:24<00:08, 199.57 examples/s][A
Tokenizing Prompts (num_proc=54):  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 3997/5677 [00:24<00:07, 235.88 examples/s][A
Tokenizing Prompts (num_proc=54):  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 4102/5677 [00:25<00:07, 222.85 examples/s][A
Tokenizing Prompts (num_proc=54):  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 4207/5677 [00:25<00:06, 237.76 examples/s][A
Tokenizing Prompts (num_proc=54):  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 4312/5677 [00:25<00:05, 240.72 examples/s][A
Tokenizing Prompts (num_proc=54):  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 4417/5677 [00:26<00:05, 224.94 examples/s][A
Tokenizing Prompts (num_proc=54):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 4522/5677 [00:27<00:05, 209.06 examples/s][A
Tokenizing Prompts (num_proc=54):  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4627/5677 [00:27<00:04, 226.02 examples/s][A
Tokenizing Prompts (num_proc=54):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 4732/5677 [00:27<00:04, 228.27 examples/s][A
Tokenizing Prompts (num_proc=54):  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4837/5677 [00:28<00:03, 232.38 examples/s][A
Tokenizing Prompts (num_proc=54):  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 4942/5677 [00:28<00:03, 232.81 examples/s][A
Tokenizing Prompts (num_proc=54):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 5047/5677 [00:29<00:02, 215.93 examples/s][A
Tokenizing Prompts (num_proc=54):  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 5152/5677 [00:29<00:02, 226.40 examples/s][A
Tokenizing Prompts (num_proc=54):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5257/5677 [00:30<00:01, 226.91 examples/s][A
Tokenizing Prompts (num_proc=54):  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5362/5677 [00:30<00:01, 219.55 examples/s][A
Tokenizing Prompts (num_proc=54):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5467/5677 [00:31<00:00, 219.70 examples/s][A
Tokenizing Prompts (num_proc=54):  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5572/5677 [00:31<00:00, 227.55 examples/s][A
Tokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:31<00:00, 248.24 examples/s][ATokenizing Prompts (num_proc=54): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:32<00:00, 172.15 examples/s]

Dropping Long Sequences:   0%|                                                                                                                                                                               | 0/5677 [00:00<?, ? examples/s][A
Dropping Long Sequences:  18%|████████████████████████████▌                                                                                                                                     | 1000/5677 [00:00<00:04, 1044.51 examples/s][A
Dropping Long Sequences:  35%|█████████████████████████████████████████████████████████                                                                                                         | 2000/5677 [00:01<00:02, 1407.93 examples/s][A
Dropping Long Sequences:  53%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 3000/5677 [00:01<00:01, 1641.14 examples/s][A
Dropping Long Sequences:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                               | 4000/5677 [00:02<00:00, 1724.00 examples/s][A
Dropping Long Sequences:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 5000/5677 [00:03<00:00, 1734.78 examples/s][A
Dropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1767.45 examples/s][ADropping Long Sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5677/5677 [00:03<00:00, 1636.34 examples/s]

Add position_id column (Pretraining Sample Packing):   0%|                                                                                                                                                   | 0/5031 [00:00<?, ? examples/s][A
Add position_id column (Pretraining Sample Packing):  20%|██████████████████████████▋                                                                                                           | 1000/5031 [00:00<00:02, 1513.53 examples/s][A
Add position_id column (Pretraining Sample Packing):  40%|█████████████████████████████████████████████████████▎                                                                                | 2000/5031 [00:00<00:01, 2209.60 examples/s][A
Add position_id column (Pretraining Sample Packing):  60%|███████████████████████████████████████████████████████████████████████████████▉                                                      | 3000/5031 [00:01<00:00, 2570.96 examples/s][A
Add position_id column (Pretraining Sample Packing):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 4000/5031 [00:01<00:00, 2797.96 examples/s][A
Add position_id column (Pretraining Sample Packing):  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5000/5031 [00:01<00:00, 2879.92 examples/s][AAdd position_id column (Pretraining Sample Packing): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5031/5031 [00:01<00:00, 2570.00 examples/s]
[2026-01-27 10:48:48,980] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:62697] Using single process for pack_parallel, running sequentially.
[2026-01-27 10:48:54,016] [WARNING] [py.warnings._showwarnmsg:109] [PID:62697] /apool/venvi/lib/python3.11/site-packages/datasets/formatting/torch_formatter.py:222: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})

 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5281/5680 [12:59:48<3:10:43, 28.68s/it]                                                                                                                                                                                                                                             {'loss': '0.5469', 'grad_norm': '0.43', 'learning_rate': '2.437e-06', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43261952, 'tokens/trainable': 42768024, 'epoch': '8'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5281/5680 [12:59:48<3:10:43, 28.68s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5282/5680 [12:59:56<2:28:52, 22.44s/it]                                                                                                                                                                                                                                             {'loss': '0.5324', 'grad_norm': '0.4714', 'learning_rate': '2.425e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43270144, 'tokens/trainable': 42776176, 'epoch': '8'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 5282/5680 [12:59:56<2:28:52, 22.44s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5283/5680 [13:00:04<1:59:34, 18.07s/it]                                                                                                                                                                                                                                             {'loss': '0.4777', 'grad_norm': '0.4116', 'learning_rate': '2.413e-06', 'ppl': '1.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43278336, 'tokens/trainable': 42784320, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5283/5680 [13:00:04<1:59:34, 18.07s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5284/5680 [13:00:12<1:39:00, 15.00s/it]                                                                                                                                                                                                                                             {'loss': '0.418', 'grad_norm': '0.4365', 'learning_rate': '2.401e-06', 'ppl': '1.519', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43286528, 'tokens/trainable': 42792452, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5284/5680 [13:00:12<1:39:00, 15.00s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5285/5680 [13:00:19<1:24:34, 12.85s/it]                                                                                                                                                                                                                                             {'loss': '0.595', 'grad_norm': '0.4513', 'learning_rate': '2.389e-06', 'ppl': '1.813', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 43294720, 'tokens/trainable': 42800608, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5285/5680 [13:00:19<1:24:34, 12.85s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5286/5680 [13:00:27<1:14:34, 11.36s/it]                                                                                                                                                                                                                                             {'loss': '0.3046', 'grad_norm': '0.3616', 'learning_rate': '2.377e-06', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 43302912, 'tokens/trainable': 42808728, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5286/5680 [13:00:27<1:14:34, 11.36s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5287/5680 [13:00:35<1:07:30, 10.31s/it]                                                                                                                                                                                                                                             {'loss': '0.4596', 'grad_norm': '0.4101', 'learning_rate': '2.365e-06', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43311104, 'tokens/trainable': 42816888, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5287/5680 [13:00:35<1:07:30, 10.31s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5288/5680 [13:00:43<1:02:33,  9.58s/it]                                                                                                                                                                                                                                             {'loss': '0.4588', 'grad_norm': '0.4739', 'learning_rate': '2.353e-06', 'ppl': '1.582', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 43319296, 'tokens/trainable': 42825008, 'epoch': '8.001'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5288/5680 [13:00:43<1:02:33,  9.58s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5289/5680 [13:00:51<59:04,  9.06s/it]                                                                                                                                                                                                                                             {'loss': '0.4959', 'grad_norm': '0.4706', 'learning_rate': '2.341e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43327488, 'tokens/trainable': 42833168, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5289/5680 [13:00:51<59:04,  9.06s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5290/5680 [13:00:59<56:40,  8.72s/it]                                                                                                                                                                                                                                             {'loss': '0.2606', 'grad_norm': '0.3674', 'learning_rate': '2.329e-06', 'ppl': '1.298', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 43335680, 'tokens/trainable': 42841328, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 5290/5680 [13:00:59<56:40,  8.72s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5291/5680 [13:01:07<54:48,  8.45s/it]                                                                                                                                                                                                                                             {'loss': '0.5239', 'grad_norm': '0.4843', 'learning_rate': '2.317e-06', 'ppl': '1.689', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43343872, 'tokens/trainable': 42849464, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5291/5680 [13:01:07<54:48,  8.45s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5292/5680 [13:01:14<53:27,  8.27s/it]                                                                                                                                                                                                                                             {'loss': '0.3286', 'grad_norm': '0.3699', 'learning_rate': '2.306e-06', 'ppl': '1.389', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 43352064, 'tokens/trainable': 42857632, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5292/5680 [13:01:14<53:27,  8.27s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5293/5680 [13:01:22<52:32,  8.15s/it]                                                                                                                                                                                                                                             {'loss': '0.6269', 'grad_norm': '0.452', 'learning_rate': '2.294e-06', 'ppl': '1.872', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43360256, 'tokens/trainable': 42865812, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 5293/5680 [13:01:22<52:32,  8.15s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5294/5680 [13:01:30<51:47,  8.05s/it]                                                                                                                                                                                                                                             {'loss': '0.4729', 'grad_norm': '0.3839', 'learning_rate': '2.282e-06', 'ppl': '1.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 43368448, 'tokens/trainable': 42873976, 'epoch': '8.002'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5294/5680 [13:01:30<51:47,  8.05s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5295/5680 [13:01:38<51:22,  8.01s/it]                                                                                                                                                                                                                                             {'loss': '0.4776', 'grad_norm': '0.4619', 'learning_rate': '2.27e-06', 'ppl': '1.612', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43376640, 'tokens/trainable': 42882160, 'epoch': '8.003'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5295/5680 [13:01:38<51:22,  8.01s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5296/5680 [13:01:46<50:58,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.3362', 'grad_norm': '0.4719', 'learning_rate': '2.259e-06', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43384832, 'tokens/trainable': 42890296, 'epoch': '8.003'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5296/5680 [13:01:46<50:58,  7.96s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5297/5680 [13:01:54<50:40,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.2582', 'grad_norm': '0.3755', 'learning_rate': '2.247e-06', 'ppl': '1.295', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43393024, 'tokens/trainable': 42898456, 'epoch': '8.003'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 5297/5680 [13:01:54<50:40,  7.94s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5298/5680 [13:02:02<50:22,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3938', 'grad_norm': '0.4034', 'learning_rate': '2.235e-06', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43401216, 'tokens/trainable': 42906588, 'epoch': '8.003'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5298/5680 [13:02:02<50:22,  7.91s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5299/5680 [13:02:09<50:08,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.473', 'grad_norm': '0.4799', 'learning_rate': '2.224e-06', 'ppl': '1.605', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43409408, 'tokens/trainable': 42914704, 'epoch': '8.003'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5299/5680 [13:02:09<50:08,  7.90s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5300/5680 [13:02:17<49:58,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5132', 'grad_norm': '0.4134', 'learning_rate': '2.212e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43417600, 'tokens/trainable': 42922884, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5300/5680 [13:02:17<49:58,  7.89s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5301/5680 [13:02:25<49:44,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4806', 'grad_norm': '0.397', 'learning_rate': '2.201e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43425792, 'tokens/trainable': 42931040, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 5301/5680 [13:02:25<49:44,  7.87s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5302/5680 [13:02:33<49:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6085', 'grad_norm': '0.5312', 'learning_rate': '2.189e-06', 'ppl': '1.838', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43433984, 'tokens/trainable': 42939176, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5302/5680 [13:02:33<49:34,  7.87s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5303/5680 [13:02:41<49:32,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4583', 'grad_norm': '0.389', 'learning_rate': '2.178e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 43442176, 'tokens/trainable': 42947344, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5303/5680 [13:02:41<49:32,  7.88s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5304/5680 [13:02:49<49:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.7176', 'grad_norm': '0.4384', 'learning_rate': '2.166e-06', 'ppl': '2.05', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 43450368, 'tokens/trainable': 42955464, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 5304/5680 [13:02:49<49:20,  7.87s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5305/5680 [13:02:57<49:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4561', 'grad_norm': '0.3929', 'learning_rate': '2.155e-06', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43458560, 'tokens/trainable': 42963620, 'epoch': '8.004'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5305/5680 [13:02:57<49:15,  7.88s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5306/5680 [13:03:04<48:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3413', 'grad_norm': '0.4522', 'learning_rate': '2.143e-06', 'ppl': '1.407', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43466752, 'tokens/trainable': 42971748, 'epoch': '8.005'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5306/5680 [13:03:05<48:59,  7.86s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5307/5680 [13:03:12<48:49,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4131', 'grad_norm': '0.399', 'learning_rate': '2.132e-06', 'ppl': '1.511', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 43474944, 'tokens/trainable': 42979940, 'epoch': '8.005'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5307/5680 [13:03:12<48:49,  7.85s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5308/5680 [13:03:20<48:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2399', 'grad_norm': '0.3703', 'learning_rate': '2.121e-06', 'ppl': '1.271', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 43483136, 'tokens/trainable': 42988060, 'epoch': '8.005'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 5308/5680 [13:03:20<48:46,  7.87s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5309/5680 [13:03:28<48:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4716', 'grad_norm': '0.4655', 'learning_rate': '2.109e-06', 'ppl': '1.603', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43491328, 'tokens/trainable': 42996240, 'epoch': '8.005'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5309/5680 [13:03:28<48:38,  7.87s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5310/5680 [13:03:36<48:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4727', 'grad_norm': '0.4369', 'learning_rate': '2.098e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 43499520, 'tokens/trainable': 43004424, 'epoch': '8.005'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5310/5680 [13:03:36<48:27,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5311/5680 [13:03:44<48:26,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3557', 'grad_norm': '0.4208', 'learning_rate': '2.087e-06', 'ppl': '1.427', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 43507712, 'tokens/trainable': 43012576, 'epoch': '8.005'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5311/5680 [13:03:44<48:26,  7.88s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5312/5680 [13:03:52<48:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4834', 'grad_norm': '0.4415', 'learning_rate': '2.075e-06', 'ppl': '1.622', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43515904, 'tokens/trainable': 43020716, 'epoch': '8.006'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 5312/5680 [13:03:52<48:12,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5313/5680 [13:04:00<48:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5661', 'grad_norm': '0.4053', 'learning_rate': '2.064e-06', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43524096, 'tokens/trainable': 43028832, 'epoch': '8.006'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5313/5680 [13:04:00<48:04,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5314/5680 [13:04:07<47:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4046', 'grad_norm': '0.4474', 'learning_rate': '2.053e-06', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43532288, 'tokens/trainable': 43036992, 'epoch': '8.006'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5314/5680 [13:04:07<47:58,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5315/5680 [13:04:15<47:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3833', 'grad_norm': '0.4164', 'learning_rate': '2.042e-06', 'ppl': '1.467', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43540480, 'tokens/trainable': 43045176, 'epoch': '8.006'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 5315/5680 [13:04:15<47:50,  7.87s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5316/5680 [13:04:23<47:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3716', 'grad_norm': '0.376', 'learning_rate': '2.031e-06', 'ppl': '1.45', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43548672, 'tokens/trainable': 43053312, 'epoch': '8.006'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5316/5680 [13:04:23<47:38,  7.85s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5317/5680 [13:04:31<47:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5215', 'grad_norm': '0.3973', 'learning_rate': '2.02e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43556864, 'tokens/trainable': 43061460, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5317/5680 [13:04:31<47:31,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5318/5680 [13:04:39<47:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4571', 'grad_norm': '0.4409', 'learning_rate': '2.009e-06', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43565056, 'tokens/trainable': 43069604, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5318/5680 [13:04:39<47:25,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5319/5680 [13:04:47<47:51,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4325', 'grad_norm': '0.3796', 'learning_rate': '1.998e-06', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1001', 'tokens/total': 43573248, 'tokens/trainable': 43077784, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 5319/5680 [13:04:47<47:51,  7.96s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5320/5680 [13:04:55<47:34,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3906', 'grad_norm': '0.3955', 'learning_rate': '1.987e-06', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43581440, 'tokens/trainable': 43085968, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5320/5680 [13:04:55<47:34,  7.93s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5321/5680 [13:05:03<47:18,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5387', 'grad_norm': '0.4131', 'learning_rate': '1.976e-06', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43589632, 'tokens/trainable': 43094144, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5321/5680 [13:05:03<47:18,  7.91s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5322/5680 [13:05:11<47:05,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.635', 'grad_norm': '0.4906', 'learning_rate': '1.965e-06', 'ppl': '1.887', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43597824, 'tokens/trainable': 43102276, 'epoch': '8.007'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5322/5680 [13:05:11<47:05,  7.89s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5323/5680 [13:05:18<46:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3356', 'grad_norm': '0.4074', 'learning_rate': '1.954e-06', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43606016, 'tokens/trainable': 43110412, 'epoch': '8.008'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 5323/5680 [13:05:18<46:53,  7.88s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5324/5680 [13:05:26<46:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.535', 'grad_norm': '0.4239', 'learning_rate': '1.943e-06', 'ppl': '1.707', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43614208, 'tokens/trainable': 43118544, 'epoch': '8.008'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5324/5680 [13:05:26<46:39,  7.86s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5325/5680 [13:05:34<46:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5907', 'grad_norm': '0.4721', 'learning_rate': '1.932e-06', 'ppl': '1.805', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43622400, 'tokens/trainable': 43126720, 'epoch': '8.008'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5325/5680 [13:05:34<46:34,  7.87s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5326/5680 [13:05:42<46:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4857', 'grad_norm': '0.4088', 'learning_rate': '1.921e-06', 'ppl': '1.625', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 43630592, 'tokens/trainable': 43134872, 'epoch': '8.008'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 5326/5680 [13:05:42<46:27,  7.87s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5327/5680 [13:05:50<46:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.62', 'grad_norm': '0.4391', 'learning_rate': '1.911e-06', 'ppl': '1.859', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43638784, 'tokens/trainable': 43143020, 'epoch': '8.008'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5327/5680 [13:05:50<46:15,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5328/5680 [13:05:58<46:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3549', 'grad_norm': '0.4334', 'learning_rate': '1.9e-06', 'ppl': '1.426', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 43646976, 'tokens/trainable': 43151172, 'epoch': '8.008'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5328/5680 [13:05:58<46:01,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5329/5680 [13:06:06<45:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3932', 'grad_norm': '0.4124', 'learning_rate': '1.889e-06', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43655168, 'tokens/trainable': 43159352, 'epoch': '8.009'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5329/5680 [13:06:06<45:57,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5330/5680 [13:06:13<45:47,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3764', 'grad_norm': '0.4094', 'learning_rate': '1.879e-06', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 43663360, 'tokens/trainable': 43167512, 'epoch': '8.009'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 5330/5680 [13:06:13<45:47,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5331/5680 [13:06:21<45:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.565', 'grad_norm': '0.4545', 'learning_rate': '1.868e-06', 'ppl': '1.76', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43671552, 'tokens/trainable': 43175636, 'epoch': '8.009'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5331/5680 [13:06:21<45:39,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5332/5680 [13:06:29<45:31,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3942', 'grad_norm': '0.4162', 'learning_rate': '1.857e-06', 'ppl': '1.483', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43679744, 'tokens/trainable': 43183768, 'epoch': '8.009'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5332/5680 [13:06:29<45:31,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5333/5680 [13:06:37<45:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4536', 'grad_norm': '0.4401', 'learning_rate': '1.847e-06', 'ppl': '1.574', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43687936, 'tokens/trainable': 43191928, 'epoch': '8.009'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5333/5680 [13:06:37<45:25,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5334/5680 [13:06:45<45:17,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5683', 'grad_norm': '0.4232', 'learning_rate': '1.836e-06', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43696128, 'tokens/trainable': 43200064, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 5334/5680 [13:06:45<45:17,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5335/5680 [13:06:53<45:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.58', 'grad_norm': '0.3922', 'learning_rate': '1.826e-06', 'ppl': '1.786', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43704320, 'tokens/trainable': 43208208, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5335/5680 [13:06:53<45:07,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5336/5680 [13:07:01<45:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3808', 'grad_norm': '0.4221', 'learning_rate': '1.815e-06', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43712512, 'tokens/trainable': 43216384, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5336/5680 [13:07:01<45:03,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5337/5680 [13:07:08<44:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3597', 'grad_norm': '0.4164', 'learning_rate': '1.805e-06', 'ppl': '1.433', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43720704, 'tokens/trainable': 43224504, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 5337/5680 [13:07:08<44:53,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5338/5680 [13:07:16<44:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4036', 'grad_norm': '0.4708', 'learning_rate': '1.794e-06', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 43728896, 'tokens/trainable': 43232644, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5338/5680 [13:07:16<44:47,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5339/5680 [13:07:24<44:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.7588', 'grad_norm': '0.4582', 'learning_rate': '1.784e-06', 'ppl': '2.136', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 43737088, 'tokens/trainable': 43240776, 'epoch': '8.01'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5339/5680 [13:07:24<44:40,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5340/5680 [13:07:32<44:29,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3934', 'grad_norm': '0.4132', 'learning_rate': '1.773e-06', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43745280, 'tokens/trainable': 43248908, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5340/5680 [13:07:32<44:29,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5341/5680 [13:07:40<44:19,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3681', 'grad_norm': '0.3912', 'learning_rate': '1.763e-06', 'ppl': '1.445', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43753472, 'tokens/trainable': 43257048, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 5341/5680 [13:07:40<44:19,  7.84s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5342/5680 [13:07:48<44:11,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.6906', 'grad_norm': '0.4728', 'learning_rate': '1.753e-06', 'ppl': '1.995', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 43761664, 'tokens/trainable': 43265144, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5342/5680 [13:07:48<44:11,  7.84s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5343/5680 [13:07:55<44:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5966', 'grad_norm': '0.4595', 'learning_rate': '1.742e-06', 'ppl': '1.816', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43769856, 'tokens/trainable': 43273308, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5343/5680 [13:07:55<44:04,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5344/5680 [13:08:03<43:57,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4724', 'grad_norm': '0.3907', 'learning_rate': '1.732e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43778048, 'tokens/trainable': 43281476, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5344/5680 [13:08:03<43:57,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5345/5680 [13:08:11<43:47,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5743', 'grad_norm': '0.4274', 'learning_rate': '1.722e-06', 'ppl': '1.776', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 43786240, 'tokens/trainable': 43289648, 'epoch': '8.011'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 5345/5680 [13:08:11<43:47,  7.84s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5346/5680 [13:08:19<43:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.578', 'grad_norm': '0.4691', 'learning_rate': '1.712e-06', 'ppl': '1.782', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 43794432, 'tokens/trainable': 43297792, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5346/5680 [13:08:19<43:43,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5347/5680 [13:08:27<43:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4724', 'grad_norm': '0.3862', 'learning_rate': '1.701e-06', 'ppl': '1.604', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43802624, 'tokens/trainable': 43305964, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5347/5680 [13:08:27<43:35,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5348/5680 [13:08:35<43:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6038', 'grad_norm': '0.4608', 'learning_rate': '1.691e-06', 'ppl': '1.829', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43810816, 'tokens/trainable': 43314128, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 5348/5680 [13:08:35<43:27,  7.85s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5349/5680 [13:08:43<43:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3212', 'grad_norm': '0.4306', 'learning_rate': '1.681e-06', 'ppl': '1.379', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43819008, 'tokens/trainable': 43322304, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5349/5680 [13:08:43<43:24,  7.87s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5350/5680 [13:08:51<43:17,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4634', 'grad_norm': '0.4343', 'learning_rate': '1.671e-06', 'ppl': '1.59', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 43827200, 'tokens/trainable': 43330460, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5350/5680 [13:08:51<43:17,  7.87s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5351/5680 [13:08:58<43:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3624', 'grad_norm': '0.3753', 'learning_rate': '1.661e-06', 'ppl': '1.437', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43835392, 'tokens/trainable': 43338628, 'epoch': '8.012'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5351/5680 [13:08:58<43:08,  7.87s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5352/5680 [13:09:06<42:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2788', 'grad_norm': '0.3898', 'learning_rate': '1.651e-06', 'ppl': '1.322', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 43843584, 'tokens/trainable': 43346800, 'epoch': '8.013'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 5352/5680 [13:09:06<42:58,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5353/5680 [13:09:14<42:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4342', 'grad_norm': '0.4034', 'learning_rate': '1.641e-06', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43851776, 'tokens/trainable': 43354956, 'epoch': '8.013'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5353/5680 [13:09:14<42:49,  7.86s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5354/5680 [13:09:22<42:36,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3764', 'grad_norm': '0.3876', 'learning_rate': '1.631e-06', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 43859968, 'tokens/trainable': 43363076, 'epoch': '8.013'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5354/5680 [13:09:22<42:36,  7.84s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5355/5680 [13:09:30<42:59,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.6013', 'grad_norm': '0.471', 'learning_rate': '1.621e-06', 'ppl': '1.824', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1002', 'tokens/total': 43868160, 'tokens/trainable': 43371244, 'epoch': '8.013'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5355/5680 [13:09:30<42:59,  7.94s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5356/5680 [13:09:38<42:44,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.4581', 'grad_norm': '0.4244', 'learning_rate': '1.611e-06', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43876352, 'tokens/trainable': 43379404, 'epoch': '8.013'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 5356/5680 [13:09:38<42:44,  7.92s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5357/5680 [13:09:46<42:34,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4315', 'grad_norm': '0.4307', 'learning_rate': '1.601e-06', 'ppl': '1.54', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 43884544, 'tokens/trainable': 43387568, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5357/5680 [13:09:46<42:34,  7.91s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5358/5680 [13:09:54<42:26,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4193', 'grad_norm': '0.4058', 'learning_rate': '1.592e-06', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 43892736, 'tokens/trainable': 43395696, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5358/5680 [13:09:54<42:26,  7.91s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5359/5680 [13:10:02<42:17,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.431', 'grad_norm': '0.4833', 'learning_rate': '1.582e-06', 'ppl': '1.539', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 43900928, 'tokens/trainable': 43403820, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 5359/5680 [13:10:02<42:17,  7.91s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5360/5680 [13:10:09<42:06,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.5876', 'grad_norm': '0.4113', 'learning_rate': '1.572e-06', 'ppl': '1.8', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43909120, 'tokens/trainable': 43411984, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5360/5680 [13:10:09<42:06,  7.90s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5361/5680 [13:10:17<41:55,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3482', 'grad_norm': '0.4581', 'learning_rate': '1.562e-06', 'ppl': '1.416', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 43917312, 'tokens/trainable': 43420104, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5361/5680 [13:10:17<41:55,  7.88s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5362/5680 [13:10:25<41:50,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3982', 'grad_norm': '0.4238', 'learning_rate': '1.552e-06', 'ppl': '1.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 43925504, 'tokens/trainable': 43428224, 'epoch': '8.014'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5362/5680 [13:10:25<41:50,  7.89s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5363/5680 [13:10:33<41:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4288', 'grad_norm': '0.4369', 'learning_rate': '1.543e-06', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 43933696, 'tokens/trainable': 43436400, 'epoch': '8.015'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 5363/5680 [13:10:33<41:37,  7.88s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5364/5680 [13:10:41<41:28,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4804', 'grad_norm': '0.4511', 'learning_rate': '1.533e-06', 'ppl': '1.617', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 43941888, 'tokens/trainable': 43444492, 'epoch': '8.015'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5364/5680 [13:10:41<41:28,  7.87s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5365/5680 [13:10:49<41:28,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3046', 'grad_norm': '0.4117', 'learning_rate': '1.523e-06', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 43950080, 'tokens/trainable': 43452644, 'epoch': '8.015'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5365/5680 [13:10:49<41:28,  7.90s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5366/5680 [13:10:57<41:16,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2692', 'grad_norm': '0.3696', 'learning_rate': '1.514e-06', 'ppl': '1.309', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 43958272, 'tokens/trainable': 43460792, 'epoch': '8.015'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5366/5680 [13:10:57<41:16,  7.89s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5367/5680 [13:11:05<41:05,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6369', 'grad_norm': '0.4154', 'learning_rate': '1.504e-06', 'ppl': '1.891', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 43966464, 'tokens/trainable': 43468948, 'epoch': '8.015'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 5367/5680 [13:11:05<41:05,  7.88s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5368/5680 [13:11:12<40:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4288', 'grad_norm': '0.3991', 'learning_rate': '1.495e-06', 'ppl': '1.535', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 43974656, 'tokens/trainable': 43477080, 'epoch': '8.015'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5368/5680 [13:11:12<40:53,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5369/5680 [13:11:20<40:47,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.505', 'grad_norm': '0.4325', 'learning_rate': '1.485e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 43982848, 'tokens/trainable': 43485208, 'epoch': '8.016'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5369/5680 [13:11:20<40:47,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5370/5680 [13:11:28<40:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3217', 'grad_norm': '0.4602', 'learning_rate': '1.476e-06', 'ppl': '1.38', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 43991040, 'tokens/trainable': 43493376, 'epoch': '8.016'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 5370/5680 [13:11:28<40:37,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5371/5680 [13:11:36<40:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3861', 'grad_norm': '0.4531', 'learning_rate': '1.466e-06', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 43999232, 'tokens/trainable': 43501564, 'epoch': '8.016'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5371/5680 [13:11:36<40:27,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5372/5680 [13:11:44<40:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4191', 'grad_norm': '0.4475', 'learning_rate': '1.457e-06', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 44007424, 'tokens/trainable': 43509728, 'epoch': '8.016'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5372/5680 [13:11:44<40:22,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5373/5680 [13:11:52<40:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4329', 'grad_norm': '0.4255', 'learning_rate': '1.448e-06', 'ppl': '1.542', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44015616, 'tokens/trainable': 43517864, 'epoch': '8.016'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5373/5680 [13:11:52<40:16,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5374/5680 [13:12:00<40:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.351', 'grad_norm': '0.3926', 'learning_rate': '1.438e-06', 'ppl': '1.421', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44023808, 'tokens/trainable': 43526016, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 5374/5680 [13:12:00<40:07,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5375/5680 [13:12:08<39:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5952', 'grad_norm': '0.4283', 'learning_rate': '1.429e-06', 'ppl': '1.813', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44032000, 'tokens/trainable': 43534196, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5375/5680 [13:12:08<39:59,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5376/5680 [13:12:15<39:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6286', 'grad_norm': '0.472', 'learning_rate': '1.42e-06', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 44040192, 'tokens/trainable': 43542284, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5376/5680 [13:12:15<39:53,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5377/5680 [13:12:23<39:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3107', 'grad_norm': '0.3821', 'learning_rate': '1.41e-06', 'ppl': '1.364', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44048384, 'tokens/trainable': 43550372, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5377/5680 [13:12:23<39:42,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5378/5680 [13:12:31<39:32,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3755', 'grad_norm': '0.404', 'learning_rate': '1.401e-06', 'ppl': '1.456', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44056576, 'tokens/trainable': 43558504, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 5378/5680 [13:12:31<39:32,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5379/5680 [13:12:39<39:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3923', 'grad_norm': '0.3911', 'learning_rate': '1.392e-06', 'ppl': '1.48', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44064768, 'tokens/trainable': 43566620, 'epoch': '8.017'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5379/5680 [13:12:39<39:25,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5380/5680 [13:12:47<39:13,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4218', 'grad_norm': '0.4165', 'learning_rate': '1.383e-06', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 44072960, 'tokens/trainable': 43574768, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5380/5680 [13:12:47<39:13,  7.84s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5381/5680 [13:12:55<39:08,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6375', 'grad_norm': '0.4779', 'learning_rate': '1.373e-06', 'ppl': '1.892', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44081152, 'tokens/trainable': 43582892, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5381/5680 [13:12:55<39:08,  7.85s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5382/5680 [13:13:03<39:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3143', 'grad_norm': '0.3882', 'learning_rate': '1.364e-06', 'ppl': '1.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44089344, 'tokens/trainable': 43591080, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 5382/5680 [13:13:03<39:02,  7.86s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5383/5680 [13:13:10<38:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3736', 'grad_norm': '0.5113', 'learning_rate': '1.355e-06', 'ppl': '1.453', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 44097536, 'tokens/trainable': 43599200, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5383/5680 [13:13:10<38:57,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5384/5680 [13:13:18<38:50,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4438', 'grad_norm': '0.5082', 'learning_rate': '1.346e-06', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44105728, 'tokens/trainable': 43607312, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5384/5680 [13:13:18<38:50,  7.87s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5385/5680 [13:13:26<38:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5268', 'grad_norm': '0.4322', 'learning_rate': '1.337e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44113920, 'tokens/trainable': 43615444, 'epoch': '8.018'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 5385/5680 [13:13:26<38:39,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5386/5680 [13:13:34<38:33,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6241', 'grad_norm': '0.4596', 'learning_rate': '1.328e-06', 'ppl': '1.867', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 44122112, 'tokens/trainable': 43623604, 'epoch': '8.019'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5386/5680 [13:13:34<38:33,  7.87s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5387/5680 [13:13:42<38:22,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5559', 'grad_norm': '0.4037', 'learning_rate': '1.319e-06', 'ppl': '1.743', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44130304, 'tokens/trainable': 43631728, 'epoch': '8.019'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5387/5680 [13:13:42<38:22,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5388/5680 [13:13:50<38:14,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5117', 'grad_norm': '0.4394', 'learning_rate': '1.31e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44138496, 'tokens/trainable': 43639900, 'epoch': '8.019'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5388/5680 [13:13:50<38:14,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5389/5680 [13:13:58<38:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3358', 'grad_norm': '0.4076', 'learning_rate': '1.301e-06', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44146688, 'tokens/trainable': 43648052, 'epoch': '8.019'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 5389/5680 [13:13:58<38:11,  7.87s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5390/5680 [13:14:05<38:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.1789', 'grad_norm': '0.3254', 'learning_rate': '1.292e-06', 'ppl': '1.196', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44154880, 'tokens/trainable': 43656196, 'epoch': '8.019'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5390/5680 [13:14:05<38:00,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5391/5680 [13:14:13<37:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4106', 'grad_norm': '0.4168', 'learning_rate': '1.284e-06', 'ppl': '1.508', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 44163072, 'tokens/trainable': 43664344, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5391/5680 [13:14:13<37:53,  7.87s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5392/5680 [13:14:21<37:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3472', 'grad_norm': '0.4016', 'learning_rate': '1.275e-06', 'ppl': '1.415', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 44171264, 'tokens/trainable': 43672512, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5392/5680 [13:14:21<37:42,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5393/5680 [13:14:29<37:30,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3347', 'grad_norm': '0.3293', 'learning_rate': '1.266e-06', 'ppl': '1.398', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 44179456, 'tokens/trainable': 43680676, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 5393/5680 [13:14:29<37:30,  7.84s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5394/5680 [13:14:37<37:25,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4228', 'grad_norm': '0.4464', 'learning_rate': '1.257e-06', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44187648, 'tokens/trainable': 43688780, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5394/5680 [13:14:37<37:25,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5395/5680 [13:14:45<37:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4675', 'grad_norm': '0.39', 'learning_rate': '1.249e-06', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44195840, 'tokens/trainable': 43696948, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5395/5680 [13:14:45<37:16,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5396/5680 [13:14:52<37:05,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5498', 'grad_norm': '0.4106', 'learning_rate': '1.24e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1048', 'tokens/total': 44204032, 'tokens/trainable': 43705132, 'epoch': '8.02'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 5396/5680 [13:14:52<37:05,  7.84s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5397/5680 [13:15:00<36:58,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5768', 'grad_norm': '0.4335', 'learning_rate': '1.231e-06', 'ppl': '1.78', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 44212224, 'tokens/trainable': 43713312, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5397/5680 [13:15:00<36:58,  7.84s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5398/5680 [13:15:08<36:50,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4194', 'grad_norm': '0.4027', 'learning_rate': '1.223e-06', 'ppl': '1.521', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 44220416, 'tokens/trainable': 43721500, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5398/5680 [13:15:08<36:50,  7.84s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5399/5680 [13:15:16<36:44,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4596', 'grad_norm': '0.3874', 'learning_rate': '1.214e-06', 'ppl': '1.583', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44228608, 'tokens/trainable': 43729660, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5399/5680 [13:15:16<36:44,  7.84s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5400/5680 [13:15:24<36:36,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5918', 'grad_norm': '0.3979', 'learning_rate': '1.205e-06', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44236800, 'tokens/trainable': 43737800, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 5400/5680 [13:15:24<36:36,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5401/5680 [13:15:32<36:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5526', 'grad_norm': '0.4708', 'learning_rate': '1.197e-06', 'ppl': '1.738', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44244992, 'tokens/trainable': 43745944, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5401/5680 [13:15:32<36:28,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5402/5680 [13:15:40<36:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6065', 'grad_norm': '0.451', 'learning_rate': '1.188e-06', 'ppl': '1.834', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 44253184, 'tokens/trainable': 43754100, 'epoch': '8.021'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5402/5680 [13:15:40<36:23,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5403/5680 [13:15:47<36:15,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4562', 'grad_norm': '0.458', 'learning_rate': '1.18e-06', 'ppl': '1.578', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44261376, 'tokens/trainable': 43762256, 'epoch': '8.022'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5403/5680 [13:15:47<36:15,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5404/5680 [13:15:55<36:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.361', 'grad_norm': '0.419', 'learning_rate': '1.171e-06', 'ppl': '1.435', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44269568, 'tokens/trainable': 43770428, 'epoch': '8.022'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 5404/5680 [13:15:55<36:07,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5405/5680 [13:16:03<35:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5623', 'grad_norm': '0.4647', 'learning_rate': '1.163e-06', 'ppl': '1.755', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44277760, 'tokens/trainable': 43778572, 'epoch': '8.022'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5405/5680 [13:16:03<35:58,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5406/5680 [13:16:11<35:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6053', 'grad_norm': '0.4674', 'learning_rate': '1.155e-06', 'ppl': '1.832', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44285952, 'tokens/trainable': 43786672, 'epoch': '8.022'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5406/5680 [13:16:11<35:50,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5407/5680 [13:16:19<35:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5428', 'grad_norm': '0.4945', 'learning_rate': '1.146e-06', 'ppl': '1.721', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44294144, 'tokens/trainable': 43794840, 'epoch': '8.022'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 5407/5680 [13:16:19<35:42,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5408/5680 [13:16:27<35:34,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4711', 'grad_norm': '0.4021', 'learning_rate': '1.138e-06', 'ppl': '1.602', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44302336, 'tokens/trainable': 43803012, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5408/5680 [13:16:27<35:34,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5409/5680 [13:16:35<35:27,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5774', 'grad_norm': '0.4427', 'learning_rate': '1.13e-06', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44310528, 'tokens/trainable': 43811160, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5409/5680 [13:16:35<35:27,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5410/5680 [13:16:42<35:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6082', 'grad_norm': '0.4879', 'learning_rate': '1.121e-06', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44318720, 'tokens/trainable': 43819304, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5410/5680 [13:16:42<35:18,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5411/5680 [13:16:50<35:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5021', 'grad_norm': '0.3841', 'learning_rate': '1.113e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 44326912, 'tokens/trainable': 43827416, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 5411/5680 [13:16:50<35:10,  7.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5412/5680 [13:16:58<35:09,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3241', 'grad_norm': '0.3926', 'learning_rate': '1.105e-06', 'ppl': '1.383', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44335104, 'tokens/trainable': 43835592, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5412/5680 [13:16:58<35:09,  7.87s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5413/5680 [13:17:06<35:01,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5874', 'grad_norm': '0.4235', 'learning_rate': '1.097e-06', 'ppl': '1.799', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44343296, 'tokens/trainable': 43843712, 'epoch': '8.023'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5413/5680 [13:17:06<35:01,  7.87s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5414/5680 [13:17:14<34:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4219', 'grad_norm': '0.4068', 'learning_rate': '1.088e-06', 'ppl': '1.525', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44351488, 'tokens/trainable': 43851848, 'epoch': '8.024'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5414/5680 [13:17:14<34:50,  7.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5415/5680 [13:17:22<34:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5401', 'grad_norm': '0.4705', 'learning_rate': '1.08e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44359680, 'tokens/trainable': 43860028, 'epoch': '8.024'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 5415/5680 [13:17:22<34:42,  7.86s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5416/5680 [13:17:30<34:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5071', 'grad_norm': '0.4584', 'learning_rate': '1.072e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44367872, 'tokens/trainable': 43868208, 'epoch': '8.024'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5416/5680 [13:17:30<34:35,  7.86s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5417/5680 [13:17:38<34:30,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3396', 'grad_norm': '0.3804', 'learning_rate': '1.064e-06', 'ppl': '1.404', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44376064, 'tokens/trainable': 43876348, 'epoch': '8.024'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5417/5680 [13:17:38<34:30,  7.87s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5418/5680 [13:17:45<34:21,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4115', 'grad_norm': '0.4235', 'learning_rate': '1.056e-06', 'ppl': '1.509', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44384256, 'tokens/trainable': 43884500, 'epoch': '8.024'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 5418/5680 [13:17:45<34:21,  7.87s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5419/5680 [13:17:53<34:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5455', 'grad_norm': '0.4784', 'learning_rate': '1.048e-06', 'ppl': '1.726', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 44392448, 'tokens/trainable': 43892616, 'epoch': '8.024'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5419/5680 [13:17:53<34:15,  7.87s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5420/5680 [13:18:01<34:04,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3882', 'grad_norm': '0.3796', 'learning_rate': '1.04e-06', 'ppl': '1.474', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44400640, 'tokens/trainable': 43900744, 'epoch': '8.025'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5420/5680 [13:18:01<34:04,  7.86s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5421/5680 [13:18:09<33:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4912', 'grad_norm': '0.4602', 'learning_rate': '1.032e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44408832, 'tokens/trainable': 43908876, 'epoch': '8.025'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5421/5680 [13:18:09<33:53,  7.85s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5422/5680 [13:18:17<33:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4002', 'grad_norm': '0.3553', 'learning_rate': '1.024e-06', 'ppl': '1.492', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44417024, 'tokens/trainable': 43917048, 'epoch': '8.025'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 5422/5680 [13:18:17<33:47,  7.86s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5423/5680 [13:18:25<33:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4888', 'grad_norm': '0.4305', 'learning_rate': '1.016e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 44425216, 'tokens/trainable': 43925232, 'epoch': '8.025'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5423/5680 [13:18:25<33:37,  7.85s/it] 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5424/5680 [13:18:33<33:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.1666', 'grad_norm': '0.3153', 'learning_rate': '1.009e-06', 'ppl': '1.181', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 44433408, 'tokens/trainable': 43933356, 'epoch': '8.025'}
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5424/5680 [13:18:33<33:34,  7.87s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5425/5680 [13:18:40<33:27,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.346', 'grad_norm': '0.4315', 'learning_rate': '1.001e-06', 'ppl': '1.413', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44441600, 'tokens/trainable': 43941488, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5425/5680 [13:18:40<33:27,  7.87s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5426/5680 [13:18:48<33:16,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3979', 'grad_norm': '0.4089', 'learning_rate': '9.93e-07', 'ppl': '1.489', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44449792, 'tokens/trainable': 43949612, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 5426/5680 [13:18:48<33:16,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5427/5680 [13:18:56<33:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3424', 'grad_norm': '0.3765', 'learning_rate': '9.852e-07', 'ppl': '1.408', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44457984, 'tokens/trainable': 43957704, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5427/5680 [13:18:56<33:07,  7.85s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5428/5680 [13:19:04<33:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5984', 'grad_norm': '0.4783', 'learning_rate': '9.775e-07', 'ppl': '1.819', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44466176, 'tokens/trainable': 43965872, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5428/5680 [13:19:04<33:00,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5429/5680 [13:19:12<32:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.352', 'grad_norm': '0.4119', 'learning_rate': '9.698e-07', 'ppl': '1.422', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44474368, 'tokens/trainable': 43974016, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 5429/5680 [13:19:12<32:51,  7.85s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5430/5680 [13:19:20<32:41,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3619', 'grad_norm': '0.3758', 'learning_rate': '9.621e-07', 'ppl': '1.436', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44482560, 'tokens/trainable': 43982180, 'epoch': '8.026'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5430/5680 [13:19:20<32:41,  7.85s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5431/5680 [13:19:28<32:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3966', 'grad_norm': '0.4057', 'learning_rate': '9.545e-07', 'ppl': '1.487', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44490752, 'tokens/trainable': 43990312, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5431/5680 [13:19:28<32:36,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5432/5680 [13:19:35<32:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5918', 'grad_norm': '0.427', 'learning_rate': '9.469e-07', 'ppl': '1.807', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44498944, 'tokens/trainable': 43998424, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5432/5680 [13:19:35<32:28,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5433/5680 [13:19:43<32:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3327', 'grad_norm': '0.355', 'learning_rate': '9.393e-07', 'ppl': '1.395', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44507136, 'tokens/trainable': 44006532, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 5433/5680 [13:19:43<32:20,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5434/5680 [13:19:51<32:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2517', 'grad_norm': '0.4072', 'learning_rate': '9.317e-07', 'ppl': '1.286', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 44515328, 'tokens/trainable': 44014664, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5434/5680 [13:19:51<32:16,  7.87s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5435/5680 [13:19:59<32:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3841', 'grad_norm': '0.3635', 'learning_rate': '9.242e-07', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44523520, 'tokens/trainable': 44022780, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5435/5680 [13:19:59<32:08,  7.87s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5436/5680 [13:20:07<31:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5386', 'grad_norm': '0.4677', 'learning_rate': '9.167e-07', 'ppl': '1.714', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44531712, 'tokens/trainable': 44030936, 'epoch': '8.027'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5436/5680 [13:20:07<31:58,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5437/5680 [13:20:15<32:15,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.4209', 'grad_norm': '0.4631', 'learning_rate': '9.093e-07', 'ppl': '1.523', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '998.7', 'tokens/total': 44539904, 'tokens/trainable': 44039124, 'epoch': '8.028'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 5437/5680 [13:20:15<32:15,  7.96s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5438/5680 [13:20:23<32:00,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3262', 'grad_norm': '0.3926', 'learning_rate': '9.018e-07', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 44548096, 'tokens/trainable': 44047264, 'epoch': '8.028'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5438/5680 [13:20:23<32:00,  7.94s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5439/5680 [13:20:31<31:46,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3354', 'grad_norm': '0.4044', 'learning_rate': '8.944e-07', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44556288, 'tokens/trainable': 44055348, 'epoch': '8.028'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5439/5680 [13:20:31<31:46,  7.91s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5440/5680 [13:20:39<31:35,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3303', 'grad_norm': '0.4261', 'learning_rate': '8.871e-07', 'ppl': '1.391', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44564480, 'tokens/trainable': 44063512, 'epoch': '8.028'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 5440/5680 [13:20:39<31:35,  7.90s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5441/5680 [13:20:46<31:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5641', 'grad_norm': '0.4391', 'learning_rate': '8.797e-07', 'ppl': '1.758', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 44572672, 'tokens/trainable': 44071632, 'epoch': '8.028'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5441/5680 [13:20:46<31:24,  7.88s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5442/5680 [13:20:54<31:14,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3289', 'grad_norm': '0.381', 'learning_rate': '8.724e-07', 'ppl': '1.389', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44580864, 'tokens/trainable': 44079796, 'epoch': '8.029'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5442/5680 [13:20:54<31:14,  7.88s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5443/5680 [13:21:02<31:07,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4306', 'grad_norm': '0.3741', 'learning_rate': '8.652e-07', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44589056, 'tokens/trainable': 44087932, 'epoch': '8.029'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5443/5680 [13:21:02<31:07,  7.88s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5444/5680 [13:21:10<30:59,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4307', 'grad_norm': '0.3722', 'learning_rate': '8.579e-07', 'ppl': '1.538', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44597248, 'tokens/trainable': 44096060, 'epoch': '8.029'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 5444/5680 [13:21:10<30:59,  7.88s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5445/5680 [13:21:18<30:51,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4816', 'grad_norm': '0.4699', 'learning_rate': '8.507e-07', 'ppl': '1.619', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44605440, 'tokens/trainable': 44104168, 'epoch': '8.029'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5445/5680 [13:21:18<30:51,  7.88s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5446/5680 [13:21:26<30:42,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3277', 'grad_norm': '0.3965', 'learning_rate': '8.435e-07', 'ppl': '1.388', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44613632, 'tokens/trainable': 44112344, 'epoch': '8.029'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5446/5680 [13:21:26<30:42,  7.88s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5447/5680 [13:21:34<30:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5534', 'grad_norm': '0.4877', 'learning_rate': '8.364e-07', 'ppl': '1.739', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 44621824, 'tokens/trainable': 44120504, 'epoch': '8.029'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5447/5680 [13:21:34<30:30,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5448/5680 [13:21:42<30:22,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3', 'grad_norm': '0.3361', 'learning_rate': '8.292e-07', 'ppl': '1.35', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44630016, 'tokens/trainable': 44128648, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 5448/5680 [13:21:42<30:22,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5449/5680 [13:21:49<30:12,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3961', 'grad_norm': '0.4983', 'learning_rate': '8.222e-07', 'ppl': '1.486', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 44638208, 'tokens/trainable': 44136756, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5449/5680 [13:21:49<30:12,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5450/5680 [13:21:57<30:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4062', 'grad_norm': '0.4264', 'learning_rate': '8.151e-07', 'ppl': '1.501', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44646400, 'tokens/trainable': 44144892, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5450/5680 [13:21:57<30:04,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5451/5680 [13:22:05<29:59,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3032', 'grad_norm': '0.3675', 'learning_rate': '8.081e-07', 'ppl': '1.354', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44654592, 'tokens/trainable': 44153008, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 5451/5680 [13:22:05<29:59,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5452/5680 [13:22:13<29:52,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4432', 'grad_norm': '0.4235', 'learning_rate': '8.011e-07', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 44662784, 'tokens/trainable': 44161084, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5452/5680 [13:22:13<29:52,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5453/5680 [13:22:21<29:42,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3707', 'grad_norm': '0.437', 'learning_rate': '7.941e-07', 'ppl': '1.449', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44670976, 'tokens/trainable': 44169224, 'epoch': '8.03'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5453/5680 [13:22:21<29:42,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5454/5680 [13:22:29<29:33,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3871', 'grad_norm': '0.4115', 'learning_rate': '7.871e-07', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44679168, 'tokens/trainable': 44177336, 'epoch': '8.031'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5454/5680 [13:22:29<29:33,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5455/5680 [13:22:37<29:47,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.4931', 'grad_norm': '0.4402', 'learning_rate': '7.802e-07', 'ppl': '1.637', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990.9', 'tokens/total': 44687360, 'tokens/trainable': 44185432, 'epoch': '8.031'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 5455/5680 [13:22:37<29:47,  7.94s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5456/5680 [13:22:45<29:33,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5082', 'grad_norm': '0.5579', 'learning_rate': '7.734e-07', 'ppl': '1.662', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44695552, 'tokens/trainable': 44193540, 'epoch': '8.031'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5456/5680 [13:22:45<29:33,  7.92s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5457/5680 [13:22:52<29:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2835', 'grad_norm': '0.3977', 'learning_rate': '7.665e-07', 'ppl': '1.328', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 44703744, 'tokens/trainable': 44201688, 'epoch': '8.031'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5457/5680 [13:22:52<29:19,  7.89s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5458/5680 [13:23:00<29:10,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4009', 'grad_norm': '0.3727', 'learning_rate': '7.597e-07', 'ppl': '1.493', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 44711936, 'tokens/trainable': 44209776, 'epoch': '8.031'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5458/5680 [13:23:00<29:10,  7.89s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5459/5680 [13:23:08<28:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5496', 'grad_norm': '0.4025', 'learning_rate': '7.529e-07', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44720128, 'tokens/trainable': 44217932, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 5459/5680 [13:23:08<28:59,  7.87s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5460/5680 [13:23:16<28:49,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6534', 'grad_norm': '0.4523', 'learning_rate': '7.461e-07', 'ppl': '1.922', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 44728320, 'tokens/trainable': 44226116, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5460/5680 [13:23:16<28:49,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5461/5680 [13:23:24<28:41,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.421', 'grad_norm': '0.4083', 'learning_rate': '7.394e-07', 'ppl': '1.524', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 44736512, 'tokens/trainable': 44234248, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5461/5680 [13:23:24<28:41,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5462/5680 [13:23:32<28:30,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3256', 'grad_norm': '0.4123', 'learning_rate': '7.327e-07', 'ppl': '1.385', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44744704, 'tokens/trainable': 44242392, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 5462/5680 [13:23:32<28:30,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5463/5680 [13:23:40<28:21,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4421', 'grad_norm': '0.4285', 'learning_rate': '7.26e-07', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44752896, 'tokens/trainable': 44250568, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5463/5680 [13:23:40<28:21,  7.84s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5464/5680 [13:23:47<28:13,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3362', 'grad_norm': '0.3785', 'learning_rate': '7.194e-07', 'ppl': '1.4', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 44761088, 'tokens/trainable': 44258728, 'epoch': '8.032'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5464/5680 [13:23:47<28:13,  7.84s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5465/5680 [13:23:55<28:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2625', 'grad_norm': '0.3806', 'learning_rate': '7.128e-07', 'ppl': '1.3', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44769280, 'tokens/trainable': 44266868, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5465/5680 [13:23:55<28:09,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5466/5680 [13:24:03<27:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.445', 'grad_norm': '0.4014', 'learning_rate': '7.062e-07', 'ppl': '1.561', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 44777472, 'tokens/trainable': 44275032, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 5466/5680 [13:24:03<27:59,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5467/5680 [13:24:11<27:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6121', 'grad_norm': '0.4194', 'learning_rate': '6.997e-07', 'ppl': '1.844', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44785664, 'tokens/trainable': 44283144, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5467/5680 [13:24:11<27:52,  7.85s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5468/5680 [13:24:19<27:47,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4343', 'grad_norm': '0.389', 'learning_rate': '6.932e-07', 'ppl': '1.544', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 44793856, 'tokens/trainable': 44291256, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5468/5680 [13:24:19<27:47,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5469/5680 [13:24:27<27:40,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.8668', 'grad_norm': '0.4578', 'learning_rate': '6.867e-07', 'ppl': '2.379', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 44802048, 'tokens/trainable': 44299396, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5469/5680 [13:24:27<27:40,  7.87s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5470/5680 [13:24:35<27:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4326', 'grad_norm': '0.4379', 'learning_rate': '6.802e-07', 'ppl': '1.541', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44810240, 'tokens/trainable': 44307540, 'epoch': '8.033'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 5470/5680 [13:24:35<27:30,  7.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5471/5680 [13:24:42<27:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3896', 'grad_norm': '0.3931', 'learning_rate': '6.738e-07', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44818432, 'tokens/trainable': 44315708, 'epoch': '8.034'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5471/5680 [13:24:42<27:25,  7.87s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5472/5680 [13:24:50<27:16,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.274', 'grad_norm': '0.3845', 'learning_rate': '6.674e-07', 'ppl': '1.315', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 44826624, 'tokens/trainable': 44323828, 'epoch': '8.034'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5472/5680 [13:24:50<27:16,  7.87s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5473/5680 [13:24:58<27:11,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3803', 'grad_norm': '0.3866', 'learning_rate': '6.61e-07', 'ppl': '1.463', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 44834816, 'tokens/trainable': 44332016, 'epoch': '8.034'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 5473/5680 [13:24:58<27:11,  7.88s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5474/5680 [13:25:06<27:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5881', 'grad_norm': '0.4757', 'learning_rate': '6.547e-07', 'ppl': '1.801', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 44843008, 'tokens/trainable': 44340200, 'epoch': '8.034'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5474/5680 [13:25:06<27:00,  7.87s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5475/5680 [13:25:14<26:53,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4435', 'grad_norm': '0.4475', 'learning_rate': '6.484e-07', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 44851200, 'tokens/trainable': 44348372, 'epoch': '8.034'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5475/5680 [13:25:14<26:53,  7.87s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5476/5680 [13:25:22<26:46,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4674', 'grad_norm': '0.4418', 'learning_rate': '6.421e-07', 'ppl': '1.596', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 44859392, 'tokens/trainable': 44356496, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5476/5680 [13:25:22<26:46,  7.88s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5477/5680 [13:25:30<26:37,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4577', 'grad_norm': '0.4384', 'learning_rate': '6.359e-07', 'ppl': '1.581', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44867584, 'tokens/trainable': 44364612, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 5477/5680 [13:25:30<26:37,  7.87s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5478/5680 [13:25:37<26:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.35', 'grad_norm': '0.3904', 'learning_rate': '6.297e-07', 'ppl': '1.419', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44875776, 'tokens/trainable': 44372760, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5478/5680 [13:25:37<26:27,  7.86s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5479/5680 [13:25:45<26:22,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4398', 'grad_norm': '0.4278', 'learning_rate': '6.235e-07', 'ppl': '1.552', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 44883968, 'tokens/trainable': 44380872, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5479/5680 [13:25:45<26:22,  7.87s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5480/5680 [13:25:53<26:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5537', 'grad_norm': '0.4548', 'learning_rate': '6.173e-07', 'ppl': '1.74', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 44892160, 'tokens/trainable': 44389064, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5480/5680 [13:25:53<26:15,  7.88s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5481/5680 [13:26:01<26:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3862', 'grad_norm': '0.4315', 'learning_rate': '6.112e-07', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44900352, 'tokens/trainable': 44397192, 'epoch': '8.035'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 5481/5680 [13:26:01<26:06,  7.87s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5482/5680 [13:26:09<25:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3743', 'grad_norm': '0.3926', 'learning_rate': '6.051e-07', 'ppl': '1.454', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 44908544, 'tokens/trainable': 44405276, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5482/5680 [13:26:09<25:58,  7.87s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5483/5680 [13:26:17<25:49,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.412', 'grad_norm': '0.391', 'learning_rate': '5.991e-07', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 44916736, 'tokens/trainable': 44413456, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5483/5680 [13:26:17<25:49,  7.87s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5484/5680 [13:26:25<25:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3924', 'grad_norm': '0.43', 'learning_rate': '5.93e-07', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44924928, 'tokens/trainable': 44421584, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5484/5680 [13:26:25<25:40,  7.86s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5485/5680 [13:26:33<25:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3222', 'grad_norm': '0.3742', 'learning_rate': '5.87e-07', 'ppl': '1.38', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 44933120, 'tokens/trainable': 44429676, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 5485/5680 [13:26:33<25:34,  7.87s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5486/5680 [13:26:40<25:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6073', 'grad_norm': '0.4568', 'learning_rate': '5.811e-07', 'ppl': '1.835', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 44941312, 'tokens/trainable': 44437792, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5486/5680 [13:26:40<25:23,  7.85s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5487/5680 [13:26:48<25:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.461', 'grad_norm': '0.4223', 'learning_rate': '5.751e-07', 'ppl': '1.586', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44949504, 'tokens/trainable': 44445948, 'epoch': '8.036'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5487/5680 [13:26:48<25:15,  7.85s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5488/5680 [13:26:56<25:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2522', 'grad_norm': '0.3449', 'learning_rate': '5.692e-07', 'ppl': '1.287', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44957696, 'tokens/trainable': 44454116, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 5488/5680 [13:26:56<25:07,  7.85s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5489/5680 [13:27:04<24:59,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5545', 'grad_norm': '0.4217', 'learning_rate': '5.633e-07', 'ppl': '1.741', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 44965888, 'tokens/trainable': 44462280, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5489/5680 [13:27:04<24:59,  7.85s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5490/5680 [13:27:12<24:53,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5259', 'grad_norm': '0.4203', 'learning_rate': '5.575e-07', 'ppl': '1.692', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 44974080, 'tokens/trainable': 44470412, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5490/5680 [13:27:12<24:53,  7.86s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5491/5680 [13:27:20<24:49,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5218', 'grad_norm': '0.4704', 'learning_rate': '5.517e-07', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 44982272, 'tokens/trainable': 44478584, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5491/5680 [13:27:20<24:49,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5492/5680 [13:27:28<24:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.6451', 'grad_norm': '0.4333', 'learning_rate': '5.459e-07', 'ppl': '1.906', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 44990464, 'tokens/trainable': 44486744, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 5492/5680 [13:27:28<24:39,  7.87s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5493/5680 [13:27:36<24:33,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4507', 'grad_norm': '0.4932', 'learning_rate': '5.401e-07', 'ppl': '1.569', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 44998656, 'tokens/trainable': 44494908, 'epoch': '8.037'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5493/5680 [13:27:36<24:33,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5494/5680 [13:27:43<24:25,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3876', 'grad_norm': '0.3695', 'learning_rate': '5.344e-07', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 45006848, 'tokens/trainable': 44503028, 'epoch': '8.038'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5494/5680 [13:27:43<24:25,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5495/5680 [13:27:51<24:19,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2901', 'grad_norm': '0.4474', 'learning_rate': '5.287e-07', 'ppl': '1.337', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 45015040, 'tokens/trainable': 44511136, 'epoch': '8.038'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5495/5680 [13:27:51<24:19,  7.89s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5496/5680 [13:27:59<24:09,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4016', 'grad_norm': '0.3601', 'learning_rate': '5.23e-07', 'ppl': '1.494', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45023232, 'tokens/trainable': 44519320, 'epoch': '8.038'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 5496/5680 [13:27:59<24:09,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5497/5680 [13:28:07<24:01,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4491', 'grad_norm': '0.4647', 'learning_rate': '5.174e-07', 'ppl': '1.567', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 45031424, 'tokens/trainable': 44527480, 'epoch': '8.038'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5497/5680 [13:28:07<24:01,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5498/5680 [13:28:15<23:53,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.6158', 'grad_norm': '0.5164', 'learning_rate': '5.118e-07', 'ppl': '1.851', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45039616, 'tokens/trainable': 44535668, 'epoch': '8.038'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5498/5680 [13:28:15<23:53,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5499/5680 [13:28:23<23:45,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3597', 'grad_norm': '0.446', 'learning_rate': '5.062e-07', 'ppl': '1.433', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45047808, 'tokens/trainable': 44543856, 'epoch': '8.039'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 5499/5680 [13:28:23<23:45,  7.88s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5500/5680 [13:28:31<23:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3298', 'grad_norm': '0.3524', 'learning_rate': '5.007e-07', 'ppl': '1.391', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45056000, 'tokens/trainable': 44552012, 'epoch': '8.039'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5500/5680 [13:28:31<23:35,  7.86s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5501/5680 [13:28:38<23:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3952', 'grad_norm': '0.3573', 'learning_rate': '4.952e-07', 'ppl': '1.485', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45064192, 'tokens/trainable': 44560184, 'epoch': '8.039'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5501/5680 [13:28:38<23:27,  7.86s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5502/5680 [13:28:46<23:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3102', 'grad_norm': '0.3593', 'learning_rate': '4.897e-07', 'ppl': '1.364', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45072384, 'tokens/trainable': 44568364, 'epoch': '8.039'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5502/5680 [13:28:46<23:18,  7.86s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5503/5680 [13:28:54<23:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.267', 'grad_norm': '0.3953', 'learning_rate': '4.842e-07', 'ppl': '1.306', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45080576, 'tokens/trainable': 44576544, 'epoch': '8.039'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 5503/5680 [13:28:54<23:12,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5504/5680 [13:29:02<23:03,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3168', 'grad_norm': '0.437', 'learning_rate': '4.788e-07', 'ppl': '1.373', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45088768, 'tokens/trainable': 44584720, 'epoch': '8.039'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5504/5680 [13:29:02<23:03,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5505/5680 [13:29:10<22:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.313', 'grad_norm': '0.417', 'learning_rate': '4.734e-07', 'ppl': '1.368', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1022', 'tokens/total': 45096960, 'tokens/trainable': 44592792, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5505/5680 [13:29:10<22:57,  7.87s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5506/5680 [13:29:18<22:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.648', 'grad_norm': '0.4081', 'learning_rate': '4.681e-07', 'ppl': '1.912', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45105152, 'tokens/trainable': 44600920, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5506/5680 [13:29:18<22:48,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5507/5680 [13:29:26<22:40,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5146', 'grad_norm': '0.4904', 'learning_rate': '4.627e-07', 'ppl': '1.673', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45113344, 'tokens/trainable': 44609084, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 5507/5680 [13:29:26<22:40,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5508/5680 [13:29:34<22:31,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5771', 'grad_norm': '0.4343', 'learning_rate': '4.574e-07', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45121536, 'tokens/trainable': 44617252, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5508/5680 [13:29:34<22:31,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5509/5680 [13:29:41<22:24,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4231', 'grad_norm': '0.4292', 'learning_rate': '4.522e-07', 'ppl': '1.527', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 45129728, 'tokens/trainable': 44625352, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5509/5680 [13:29:41<22:24,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5510/5680 [13:29:49<22:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5309', 'grad_norm': '0.4773', 'learning_rate': '4.469e-07', 'ppl': '1.7', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45137920, 'tokens/trainable': 44633524, 'epoch': '8.04'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 5510/5680 [13:29:49<22:15,  7.85s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5511/5680 [13:29:57<22:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5283', 'grad_norm': '0.4043', 'learning_rate': '4.417e-07', 'ppl': '1.696', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45146112, 'tokens/trainable': 44641704, 'epoch': '8.041'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5511/5680 [13:29:57<22:07,  7.85s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5512/5680 [13:30:05<22:00,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3046', 'grad_norm': '0.3752', 'learning_rate': '4.365e-07', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45154304, 'tokens/trainable': 44649872, 'epoch': '8.041'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5512/5680 [13:30:05<22:00,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5513/5680 [13:30:13<21:56,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3147', 'grad_norm': '0.4245', 'learning_rate': '4.314e-07', 'ppl': '1.37', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45162496, 'tokens/trainable': 44658052, 'epoch': '8.041'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5513/5680 [13:30:13<21:56,  7.88s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5514/5680 [13:30:21<21:45,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6384', 'grad_norm': '0.4245', 'learning_rate': '4.263e-07', 'ppl': '1.893', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45170688, 'tokens/trainable': 44666136, 'epoch': '8.041'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 5514/5680 [13:30:21<21:45,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5515/5680 [13:30:29<21:35,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.435', 'grad_norm': '0.4188', 'learning_rate': '4.212e-07', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45178880, 'tokens/trainable': 44674300, 'epoch': '8.041'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5515/5680 [13:30:29<21:35,  7.85s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5516/5680 [13:30:36<21:28,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4649', 'grad_norm': '0.3951', 'learning_rate': '4.161e-07', 'ppl': '1.592', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45187072, 'tokens/trainable': 44682472, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5516/5680 [13:30:36<21:28,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5517/5680 [13:30:44<21:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4124', 'grad_norm': '0.4122', 'learning_rate': '4.111e-07', 'ppl': '1.51', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 45195264, 'tokens/trainable': 44690572, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5517/5680 [13:30:44<21:20,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5518/5680 [13:30:52<21:12,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3892', 'grad_norm': '0.4331', 'learning_rate': '4.061e-07', 'ppl': '1.476', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45203456, 'tokens/trainable': 44698688, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 5518/5680 [13:30:52<21:12,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5519/5680 [13:31:00<21:06,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4433', 'grad_norm': '0.4334', 'learning_rate': '4.012e-07', 'ppl': '1.558', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45211648, 'tokens/trainable': 44706860, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5519/5680 [13:31:00<21:06,  7.87s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5520/5680 [13:31:08<20:59,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4142', 'grad_norm': '0.4551', 'learning_rate': '3.962e-07', 'ppl': '1.513', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45219840, 'tokens/trainable': 44715000, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5520/5680 [13:31:08<20:59,  7.87s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5521/5680 [13:31:16<20:52,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.597', 'grad_norm': '0.4587', 'learning_rate': '3.913e-07', 'ppl': '1.817', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 45228032, 'tokens/trainable': 44723116, 'epoch': '8.042'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 5521/5680 [13:31:16<20:52,  7.87s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5522/5680 [13:31:24<20:44,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5686', 'grad_norm': '0.432', 'learning_rate': '3.864e-07', 'ppl': '1.766', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45236224, 'tokens/trainable': 44731300, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5522/5680 [13:31:24<20:44,  7.88s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5523/5680 [13:31:32<20:38,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3931', 'grad_norm': '0.4101', 'learning_rate': '3.816e-07', 'ppl': '1.482', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 45244416, 'tokens/trainable': 44739456, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5523/5680 [13:31:32<20:38,  7.89s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5524/5680 [13:31:39<20:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3484', 'grad_norm': '0.4226', 'learning_rate': '3.768e-07', 'ppl': '1.417', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45252608, 'tokens/trainable': 44747632, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5524/5680 [13:31:39<20:29,  7.88s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5525/5680 [13:31:47<20:21,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5416', 'grad_norm': '0.4405', 'learning_rate': '3.72e-07', 'ppl': '1.719', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 45260800, 'tokens/trainable': 44755744, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 5525/5680 [13:31:47<20:21,  7.88s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5526/5680 [13:31:55<20:11,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5416', 'grad_norm': '0.4563', 'learning_rate': '3.673e-07', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45268992, 'tokens/trainable': 44763904, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5526/5680 [13:31:55<20:11,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5527/5680 [13:32:03<20:02,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5083', 'grad_norm': '0.397', 'learning_rate': '3.625e-07', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45277184, 'tokens/trainable': 44772020, 'epoch': '8.043'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5527/5680 [13:32:03<20:02,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5528/5680 [13:32:11<19:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4618', 'grad_norm': '0.4101', 'learning_rate': '3.578e-07', 'ppl': '1.587', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45285376, 'tokens/trainable': 44780172, 'epoch': '8.044'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5528/5680 [13:32:11<19:52,  7.85s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5529/5680 [13:32:19<19:44,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5959', 'grad_norm': '0.4403', 'learning_rate': '3.532e-07', 'ppl': '1.815', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45293568, 'tokens/trainable': 44788352, 'epoch': '8.044'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 5529/5680 [13:32:19<19:44,  7.84s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5530/5680 [13:32:27<19:38,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5353', 'grad_norm': '0.4375', 'learning_rate': '3.486e-07', 'ppl': '1.708', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45301760, 'tokens/trainable': 44796508, 'epoch': '8.044'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5530/5680 [13:32:27<19:38,  7.86s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5531/5680 [13:32:34<19:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.654', 'grad_norm': '0.4707', 'learning_rate': '3.44e-07', 'ppl': '1.923', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 45309952, 'tokens/trainable': 44804644, 'epoch': '8.044'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5531/5680 [13:32:34<19:34,  7.88s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5532/5680 [13:32:42<19:28,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.513', 'grad_norm': '0.4561', 'learning_rate': '3.394e-07', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 45318144, 'tokens/trainable': 44812808, 'epoch': '8.044'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 5532/5680 [13:32:42<19:28,  7.90s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5533/5680 [13:32:50<19:18,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3925', 'grad_norm': '0.4123', 'learning_rate': '3.349e-07', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 45326336, 'tokens/trainable': 44820912, 'epoch': '8.045'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5533/5680 [13:32:50<19:18,  7.88s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5534/5680 [13:32:58<19:08,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4075', 'grad_norm': '0.3693', 'learning_rate': '3.303e-07', 'ppl': '1.503', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45334528, 'tokens/trainable': 44829024, 'epoch': '8.045'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5534/5680 [13:32:58<19:08,  7.87s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5535/5680 [13:33:06<19:00,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5355', 'grad_norm': '0.5126', 'learning_rate': '3.259e-07', 'ppl': '1.708', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45342720, 'tokens/trainable': 44837208, 'epoch': '8.045'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5535/5680 [13:33:06<19:00,  7.87s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5536/5680 [13:33:14<18:51,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4034', 'grad_norm': '0.4266', 'learning_rate': '3.214e-07', 'ppl': '1.497', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45350912, 'tokens/trainable': 44845352, 'epoch': '8.045'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 5536/5680 [13:33:14<18:51,  7.85s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5537/5680 [13:33:22<18:44,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3047', 'grad_norm': '0.3683', 'learning_rate': '3.17e-07', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45359104, 'tokens/trainable': 44853528, 'epoch': '8.045'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5537/5680 [13:33:22<18:44,  7.86s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5538/5680 [13:33:30<18:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4801', 'grad_norm': '0.4196', 'learning_rate': '3.126e-07', 'ppl': '1.616', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45367296, 'tokens/trainable': 44861692, 'epoch': '8.045'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5538/5680 [13:33:30<18:36,  7.86s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5539/5680 [13:33:37<18:26,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.311', 'grad_norm': '0.3747', 'learning_rate': '3.083e-07', 'ppl': '1.365', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45375488, 'tokens/trainable': 44869804, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5539/5680 [13:33:37<18:26,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5540/5680 [13:33:45<18:18,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.7125', 'grad_norm': '0.501', 'learning_rate': '3.039e-07', 'ppl': '2.039', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45383680, 'tokens/trainable': 44877936, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 5540/5680 [13:33:45<18:18,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5541/5680 [13:33:53<18:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5953', 'grad_norm': '0.4954', 'learning_rate': '2.996e-07', 'ppl': '1.814', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45391872, 'tokens/trainable': 44886044, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5541/5680 [13:33:53<18:10,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5542/5680 [13:34:01<18:02,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3913', 'grad_norm': '0.4037', 'learning_rate': '2.954e-07', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45400064, 'tokens/trainable': 44894224, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5542/5680 [13:34:01<18:02,  7.84s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5543/5680 [13:34:09<17:54,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.548', 'grad_norm': '0.4418', 'learning_rate': '2.912e-07', 'ppl': '1.73', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45408256, 'tokens/trainable': 44902344, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 5543/5680 [13:34:09<17:54,  7.84s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5544/5680 [13:34:17<17:46,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3685', 'grad_norm': '0.3792', 'learning_rate': '2.87e-07', 'ppl': '1.446', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45416448, 'tokens/trainable': 44910496, 'epoch': '8.046'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5544/5680 [13:34:17<17:46,  7.84s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5545/5680 [13:34:24<17:39,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4051', 'grad_norm': '0.3919', 'learning_rate': '2.828e-07', 'ppl': '1.499', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45424640, 'tokens/trainable': 44918652, 'epoch': '8.047'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5545/5680 [13:34:24<17:39,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5546/5680 [13:34:32<17:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4897', 'grad_norm': '0.4323', 'learning_rate': '2.786e-07', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45432832, 'tokens/trainable': 44926840, 'epoch': '8.047'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5546/5680 [13:34:32<17:32,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5547/5680 [13:34:40<17:23,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4419', 'grad_norm': '0.5464', 'learning_rate': '2.745e-07', 'ppl': '1.556', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45441024, 'tokens/trainable': 44935016, 'epoch': '8.047'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 5547/5680 [13:34:40<17:23,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5548/5680 [13:34:48<17:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3253', 'grad_norm': '0.4214', 'learning_rate': '2.704e-07', 'ppl': '1.384', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45449216, 'tokens/trainable': 44943156, 'epoch': '8.047'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5548/5680 [13:34:48<17:15,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5549/5680 [13:34:56<17:08,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4477', 'grad_norm': '0.422', 'learning_rate': '2.664e-07', 'ppl': '1.565', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 45457408, 'tokens/trainable': 44951292, 'epoch': '8.047'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5549/5680 [13:34:56<17:08,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5550/5680 [13:35:04<17:00,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3788', 'grad_norm': '0.3873', 'learning_rate': '2.624e-07', 'ppl': '1.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45465600, 'tokens/trainable': 44959436, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5550/5680 [13:35:04<17:00,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5551/5680 [13:35:11<16:52,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.582', 'grad_norm': '0.4286', 'learning_rate': '2.584e-07', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 45473792, 'tokens/trainable': 44967584, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 5551/5680 [13:35:12<16:52,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5552/5680 [13:35:19<16:45,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5488', 'grad_norm': '0.4154', 'learning_rate': '2.544e-07', 'ppl': '1.731', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45481984, 'tokens/trainable': 44975728, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5552/5680 [13:35:19<16:45,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5553/5680 [13:35:27<16:37,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4293', 'grad_norm': '0.3932', 'learning_rate': '2.505e-07', 'ppl': '1.536', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45490176, 'tokens/trainable': 44983888, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5553/5680 [13:35:27<16:37,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5554/5680 [13:35:35<16:28,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4468', 'grad_norm': '0.4012', 'learning_rate': '2.466e-07', 'ppl': '1.563', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45498368, 'tokens/trainable': 44992052, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 5554/5680 [13:35:35<16:28,  7.85s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5555/5680 [13:35:43<16:43,  8.03s/it]                                                                                                                                                                                                                                             {'loss': '0.3763', 'grad_norm': '0.481', 'learning_rate': '2.427e-07', 'ppl': '1.457', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '964.2', 'tokens/total': 45506560, 'tokens/trainable': 45000192, 'epoch': '8.048'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5555/5680 [13:35:43<16:43,  8.03s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5556/5680 [13:35:51<16:30,  7.99s/it]                                                                                                                                                                                                                                             {'loss': '0.3451', 'grad_norm': '0.4112', 'learning_rate': '2.389e-07', 'ppl': '1.412', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 45514752, 'tokens/trainable': 45008332, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5556/5680 [13:35:51<16:30,  7.99s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5557/5680 [13:35:59<16:18,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.406', 'grad_norm': '0.6118', 'learning_rate': '2.351e-07', 'ppl': '1.501', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45522944, 'tokens/trainable': 45016468, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5557/5680 [13:35:59<16:18,  7.96s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5558/5680 [13:36:07<16:06,  7.92s/it]                                                                                                                                                                                                                                             {'loss': '0.5997', 'grad_norm': '0.52', 'learning_rate': '2.313e-07', 'ppl': '1.822', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45531136, 'tokens/trainable': 45024616, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 5558/5680 [13:36:07<16:06,  7.92s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5559/5680 [13:36:15<15:57,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4545', 'grad_norm': '0.3879', 'learning_rate': '2.276e-07', 'ppl': '1.575', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 45539328, 'tokens/trainable': 45032740, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5559/5680 [13:36:15<15:57,  7.91s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5560/5680 [13:36:23<15:46,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.5618', 'grad_norm': '0.6014', 'learning_rate': '2.239e-07', 'ppl': '1.754', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45547520, 'tokens/trainable': 45040848, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5560/5680 [13:36:23<15:46,  7.89s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5561/5680 [13:36:31<15:37,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5812', 'grad_norm': '0.456', 'learning_rate': '2.202e-07', 'ppl': '1.788', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45555712, 'tokens/trainable': 45049024, 'epoch': '8.049'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5561/5680 [13:36:31<15:37,  7.88s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5562/5680 [13:36:39<15:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4352', 'grad_norm': '0.4166', 'learning_rate': '2.165e-07', 'ppl': '1.545', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45563904, 'tokens/trainable': 45057156, 'epoch': '8.05'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 5562/5680 [13:36:39<15:29,  7.88s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5563/5680 [13:36:46<15:20,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3827', 'grad_norm': '0.3929', 'learning_rate': '2.129e-07', 'ppl': '1.466', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45572096, 'tokens/trainable': 45065336, 'epoch': '8.05'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5563/5680 [13:36:46<15:20,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5564/5680 [13:36:54<15:10,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.5345', 'grad_norm': '0.4597', 'learning_rate': '2.093e-07', 'ppl': '1.707', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 45580288, 'tokens/trainable': 45073480, 'epoch': '8.05'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5564/5680 [13:36:54<15:10,  7.85s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5565/5680 [13:37:02<15:01,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3861', 'grad_norm': '0.368', 'learning_rate': '2.058e-07', 'ppl': '1.471', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 45588480, 'tokens/trainable': 45081668, 'epoch': '8.05'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 5565/5680 [13:37:02<15:01,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5566/5680 [13:37:10<14:53,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3236', 'grad_norm': '0.4823', 'learning_rate': '2.022e-07', 'ppl': '1.382', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45596672, 'tokens/trainable': 45089824, 'epoch': '8.05'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5566/5680 [13:37:10<14:53,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5567/5680 [13:37:18<14:46,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4103', 'grad_norm': '0.4458', 'learning_rate': '1.987e-07', 'ppl': '1.507', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45604864, 'tokens/trainable': 45097960, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5567/5680 [13:37:18<14:46,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5568/5680 [13:37:26<14:39,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4551', 'grad_norm': '0.3924', 'learning_rate': '1.952e-07', 'ppl': '1.576', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 45613056, 'tokens/trainable': 45106040, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5568/5680 [13:37:26<14:39,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5569/5680 [13:37:34<14:34,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3771', 'grad_norm': '0.4385', 'learning_rate': '1.918e-07', 'ppl': '1.458', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 45621248, 'tokens/trainable': 45114160, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 5569/5680 [13:37:34<14:34,  7.88s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5570/5680 [13:37:41<14:25,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5334', 'grad_norm': '0.4275', 'learning_rate': '1.884e-07', 'ppl': '1.705', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45629440, 'tokens/trainable': 45122316, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5570/5680 [13:37:41<14:25,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5571/5680 [13:37:49<14:18,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4784', 'grad_norm': '0.4365', 'learning_rate': '1.85e-07', 'ppl': '1.613', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 45637632, 'tokens/trainable': 45130420, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5571/5680 [13:37:49<14:18,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5572/5680 [13:37:57<14:08,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.653', 'grad_norm': '0.4687', 'learning_rate': '1.817e-07', 'ppl': '1.921', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45645824, 'tokens/trainable': 45138592, 'epoch': '8.051'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5572/5680 [13:37:57<14:08,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5573/5680 [13:38:05<14:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3289', 'grad_norm': '0.3842', 'learning_rate': '1.784e-07', 'ppl': '1.389', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45654016, 'tokens/trainable': 45146748, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 5573/5680 [13:38:05<14:02,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5574/5680 [13:38:13<13:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3727', 'grad_norm': '0.4278', 'learning_rate': '1.751e-07', 'ppl': '1.452', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45662208, 'tokens/trainable': 45154928, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5574/5680 [13:38:13<13:54,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5575/5680 [13:38:21<13:46,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4268', 'grad_norm': '0.491', 'learning_rate': '1.718e-07', 'ppl': '1.532', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 45670400, 'tokens/trainable': 45163064, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5575/5680 [13:38:21<13:46,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5576/5680 [13:38:29<13:37,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5037', 'grad_norm': '0.5067', 'learning_rate': '1.686e-07', 'ppl': '1.655', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45678592, 'tokens/trainable': 45171212, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 5576/5680 [13:38:29<13:37,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5577/5680 [13:38:36<13:30,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6079', 'grad_norm': '0.516', 'learning_rate': '1.654e-07', 'ppl': '1.837', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 45686784, 'tokens/trainable': 45179320, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5577/5680 [13:38:36<13:30,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5578/5680 [13:38:44<13:24,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3158', 'grad_norm': '0.4128', 'learning_rate': '1.622e-07', 'ppl': '1.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1025', 'tokens/total': 45694976, 'tokens/trainable': 45187440, 'epoch': '8.052'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5578/5680 [13:38:44<13:24,  7.88s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5579/5680 [13:38:52<13:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5159', 'grad_norm': '0.4881', 'learning_rate': '1.591e-07', 'ppl': '1.675', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45703168, 'tokens/trainable': 45195580, 'epoch': '8.053'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5579/5680 [13:38:52<13:15,  7.88s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5580/5680 [13:39:00<13:07,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3264', 'grad_norm': '0.3423', 'learning_rate': '1.56e-07', 'ppl': '1.386', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45711360, 'tokens/trainable': 45203700, 'epoch': '8.053'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 5580/5680 [13:39:00<13:07,  7.87s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5581/5680 [13:39:08<12:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5679', 'grad_norm': '0.4489', 'learning_rate': '1.529e-07', 'ppl': '1.765', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45719552, 'tokens/trainable': 45211828, 'epoch': '8.053'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5581/5680 [13:39:08<12:58,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5582/5680 [13:39:16<12:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4692', 'grad_norm': '0.4501', 'learning_rate': '1.499e-07', 'ppl': '1.599', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45727744, 'tokens/trainable': 45219976, 'epoch': '8.053'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5582/5680 [13:39:16<12:50,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5583/5680 [13:39:24<12:42,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4906', 'grad_norm': '0.4134', 'learning_rate': '1.469e-07', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 45735936, 'tokens/trainable': 45228124, 'epoch': '8.053'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5583/5680 [13:39:24<12:42,  7.86s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5584/5680 [13:39:31<12:33,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.2884', 'grad_norm': '0.4574', 'learning_rate': '1.439e-07', 'ppl': '1.334', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 45744128, 'tokens/trainable': 45236308, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 5584/5680 [13:39:31<12:33,  7.85s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5585/5680 [13:39:39<12:24,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3911', 'grad_norm': '0.4074', 'learning_rate': '1.409e-07', 'ppl': '1.479', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 45752320, 'tokens/trainable': 45244488, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5585/5680 [13:39:39<12:24,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5586/5680 [13:39:47<12:16,  7.83s/it]                                                                                                                                                                                                                                             {'loss': '0.4895', 'grad_norm': '0.4187', 'learning_rate': '1.38e-07', 'ppl': '1.632', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1045', 'tokens/total': 45760512, 'tokens/trainable': 45252660, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5586/5680 [13:39:47<12:16,  7.83s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5587/5680 [13:39:55<12:09,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5218', 'grad_norm': '0.3741', 'learning_rate': '1.351e-07', 'ppl': '1.685', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45768704, 'tokens/trainable': 45260788, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5587/5680 [13:39:55<12:09,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5588/5680 [13:40:03<12:01,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5769', 'grad_norm': '0.48', 'learning_rate': '1.323e-07', 'ppl': '1.781', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 45776896, 'tokens/trainable': 45268972, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 5588/5680 [13:40:03<12:01,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5589/5680 [13:40:11<11:54,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4667', 'grad_norm': '0.4226', 'learning_rate': '1.294e-07', 'ppl': '1.595', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45785088, 'tokens/trainable': 45277152, 'epoch': '8.054'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5589/5680 [13:40:11<11:54,  7.85s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5590/5680 [13:40:18<11:45,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4687', 'grad_norm': '0.4066', 'learning_rate': '1.266e-07', 'ppl': '1.598', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45793280, 'tokens/trainable': 45285324, 'epoch': '8.055'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5590/5680 [13:40:18<11:45,  7.84s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5591/5680 [13:40:26<11:38,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4096', 'grad_norm': '0.4506', 'learning_rate': '1.239e-07', 'ppl': '1.506', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45801472, 'tokens/trainable': 45293464, 'epoch': '8.055'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 5591/5680 [13:40:26<11:38,  7.85s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5592/5680 [13:40:34<11:30,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4554', 'grad_norm': '0.4519', 'learning_rate': '1.211e-07', 'ppl': '1.577', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 45809664, 'tokens/trainable': 45301604, 'epoch': '8.055'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5592/5680 [13:40:34<11:30,  7.84s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5593/5680 [13:40:42<11:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3928', 'grad_norm': '0.436', 'learning_rate': '1.184e-07', 'ppl': '1.481', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 45817856, 'tokens/trainable': 45309732, 'epoch': '8.055'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5593/5680 [13:40:42<11:23,  7.86s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5594/5680 [13:40:50<11:15,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6666', 'grad_norm': '0.5339', 'learning_rate': '1.158e-07', 'ppl': '1.948', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45826048, 'tokens/trainable': 45317904, 'epoch': '8.055'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5594/5680 [13:40:50<11:15,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5595/5680 [13:40:58<11:07,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4085', 'grad_norm': '0.3901', 'learning_rate': '1.131e-07', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45834240, 'tokens/trainable': 45326028, 'epoch': '8.055'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 5595/5680 [13:40:58<11:07,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5596/5680 [13:41:06<10:58,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.513', 'grad_norm': '0.4208', 'learning_rate': '1.105e-07', 'ppl': '1.67', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45842432, 'tokens/trainable': 45334172, 'epoch': '8.056'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5596/5680 [13:41:06<10:58,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5597/5680 [13:41:13<10:50,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.3564', 'grad_norm': '0.3922', 'learning_rate': '1.079e-07', 'ppl': '1.428', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45850624, 'tokens/trainable': 45342312, 'epoch': '8.056'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5597/5680 [13:41:13<10:50,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5598/5680 [13:41:21<10:43,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4822', 'grad_norm': '0.4125', 'learning_rate': '1.054e-07', 'ppl': '1.62', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45858816, 'tokens/trainable': 45350416, 'epoch': '8.056'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5598/5680 [13:41:21<10:43,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5599/5680 [13:41:29<10:35,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.4441', 'grad_norm': '0.4159', 'learning_rate': '1.028e-07', 'ppl': '1.559', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45867008, 'tokens/trainable': 45358568, 'epoch': '8.056'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 5599/5680 [13:41:29<10:35,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5600/5680 [13:41:37<10:27,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.589', 'grad_norm': '0.4097', 'learning_rate': '1.003e-07', 'ppl': '1.802', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 45875200, 'tokens/trainable': 45366736, 'epoch': '8.056'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5600/5680 [13:41:37<10:27,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5601/5680 [13:41:45<10:20,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4225', 'grad_norm': '0.4256', 'learning_rate': '9.788e-08', 'ppl': '1.526', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45883392, 'tokens/trainable': 45374900, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5601/5680 [13:41:45<10:20,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5602/5680 [13:41:53<10:13,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2891', 'grad_norm': '0.4312', 'learning_rate': '9.545e-08', 'ppl': '1.335', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45891584, 'tokens/trainable': 45383024, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 5602/5680 [13:41:53<10:13,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5603/5680 [13:42:01<10:04,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4944', 'grad_norm': '0.4262', 'learning_rate': '9.305e-08', 'ppl': '1.639', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 45899776, 'tokens/trainable': 45391172, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5603/5680 [13:42:01<10:04,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5604/5680 [13:42:08<09:56,  7.84s/it]                                                                                                                                                                                                                                             {'loss': '0.5659', 'grad_norm': '0.4209', 'learning_rate': '9.068e-08', 'ppl': '1.761', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45907968, 'tokens/trainable': 45399328, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5604/5680 [13:42:08<09:56,  7.84s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5605/5680 [13:42:16<09:48,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3355', 'grad_norm': '0.4276', 'learning_rate': '8.834e-08', 'ppl': '1.399', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 45916160, 'tokens/trainable': 45407460, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5605/5680 [13:42:16<09:48,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5606/5680 [13:42:24<09:40,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3321', 'grad_norm': '0.4466', 'learning_rate': '8.603e-08', 'ppl': '1.394', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 45924352, 'tokens/trainable': 45415576, 'epoch': '8.057'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 5606/5680 [13:42:24<09:40,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5607/5680 [13:42:32<09:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.375', 'grad_norm': '0.4046', 'learning_rate': '8.375e-08', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1043', 'tokens/total': 45932544, 'tokens/trainable': 45423748, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5607/5680 [13:42:32<09:32,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5608/5680 [13:42:40<09:25,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4198', 'grad_norm': '0.484', 'learning_rate': '8.15e-08', 'ppl': '1.522', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 45940736, 'tokens/trainable': 45431896, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5608/5680 [13:42:40<09:25,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5609/5680 [13:42:48<09:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5786', 'grad_norm': '0.5392', 'learning_rate': '7.928e-08', 'ppl': '1.783', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 45948928, 'tokens/trainable': 45440076, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5609/5680 [13:42:48<09:18,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5610/5680 [13:42:56<09:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3949', 'grad_norm': '0.4475', 'learning_rate': '7.71e-08', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 45957120, 'tokens/trainable': 45448216, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 5610/5680 [13:42:56<09:10,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5611/5680 [13:43:03<09:01,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3374', 'grad_norm': '0.4073', 'learning_rate': '7.494e-08', 'ppl': '1.401', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1047', 'tokens/total': 45965312, 'tokens/trainable': 45456400, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5611/5680 [13:43:03<09:01,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5612/5680 [13:43:11<08:53,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.3153', 'grad_norm': '0.4967', 'learning_rate': '7.281e-08', 'ppl': '1.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45973504, 'tokens/trainable': 45464588, 'epoch': '8.058'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5612/5680 [13:43:11<08:53,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5613/5680 [13:43:19<08:46,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3162', 'grad_norm': '0.341', 'learning_rate': '7.072e-08', 'ppl': '1.372', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 45981696, 'tokens/trainable': 45472700, 'epoch': '8.059'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 5613/5680 [13:43:19<08:46,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5614/5680 [13:43:27<08:39,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4136', 'grad_norm': '0.3846', 'learning_rate': '6.866e-08', 'ppl': '1.512', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 45989888, 'tokens/trainable': 45480852, 'epoch': '8.059'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5614/5680 [13:43:27<08:39,  7.87s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5615/5680 [13:43:35<08:31,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5824', 'grad_norm': '0.4673', 'learning_rate': '6.662e-08', 'ppl': '1.79', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 45998080, 'tokens/trainable': 45489032, 'epoch': '8.059'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5615/5680 [13:43:35<08:31,  7.87s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5616/5680 [13:43:43<08:23,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4147', 'grad_norm': '0.4579', 'learning_rate': '6.462e-08', 'ppl': '1.514', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46006272, 'tokens/trainable': 45497160, 'epoch': '8.059'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5616/5680 [13:43:43<08:23,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5617/5680 [13:43:51<08:15,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5472', 'grad_norm': '0.5038', 'learning_rate': '6.265e-08', 'ppl': '1.728', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 46014464, 'tokens/trainable': 45505272, 'epoch': '8.059'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5617/5680 [13:43:51<08:15,  7.87s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5618/5680 [13:43:58<08:07,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5075', 'grad_norm': '0.3958', 'learning_rate': '6.07e-08', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1046', 'tokens/total': 46022656, 'tokens/trainable': 45513448, 'epoch': '8.06'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5618/5680 [13:43:58<08:07,  7.86s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5619/5680 [13:44:06<07:58,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4567', 'grad_norm': '0.4844', 'learning_rate': '5.879e-08', 'ppl': '1.579', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 46030848, 'tokens/trainable': 45521568, 'epoch': '8.06'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5619/5680 [13:44:06<07:58,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5620/5680 [13:44:14<07:50,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6361', 'grad_norm': '0.4452', 'learning_rate': '5.691e-08', 'ppl': '1.889', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46039040, 'tokens/trainable': 45529728, 'epoch': '8.06'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5620/5680 [13:44:14<07:50,  7.85s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5621/5680 [13:44:22<07:43,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4069', 'grad_norm': '0.4016', 'learning_rate': '5.506e-08', 'ppl': '1.502', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 46047232, 'tokens/trainable': 45537836, 'epoch': '8.06'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 5621/5680 [13:44:22<07:43,  7.85s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5622/5680 [13:44:30<07:36,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.2324', 'grad_norm': '0.3564', 'learning_rate': '5.324e-08', 'ppl': '1.262', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1023', 'tokens/total': 46055424, 'tokens/trainable': 45545904, 'epoch': '8.06'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5622/5680 [13:44:30<07:36,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5623/5680 [13:44:38<07:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6405', 'grad_norm': '0.4821', 'learning_rate': '5.145e-08', 'ppl': '1.897', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46063616, 'tokens/trainable': 45554060, 'epoch': '8.06'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5623/5680 [13:44:38<07:27,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5624/5680 [13:44:46<07:20,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5976', 'grad_norm': '0.4408', 'learning_rate': '4.969e-08', 'ppl': '1.818', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46071808, 'tokens/trainable': 45562204, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 5624/5680 [13:44:46<07:20,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5625/5680 [13:44:53<07:12,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2788', 'grad_norm': '0.4192', 'learning_rate': '4.796e-08', 'ppl': '1.321', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 46080000, 'tokens/trainable': 45570392, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5625/5680 [13:44:53<07:12,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5626/5680 [13:45:01<07:05,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3096', 'grad_norm': '0.4455', 'learning_rate': '4.627e-08', 'ppl': '1.363', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46088192, 'tokens/trainable': 45578572, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5626/5680 [13:45:01<07:05,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5627/5680 [13:45:09<06:57,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4906', 'grad_norm': '0.4468', 'learning_rate': '4.46e-08', 'ppl': '1.633', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 46096384, 'tokens/trainable': 45586704, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5627/5680 [13:45:09<06:57,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5628/5680 [13:45:17<06:48,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6289', 'grad_norm': '0.4034', 'learning_rate': '4.296e-08', 'ppl': '1.875', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46104576, 'tokens/trainable': 45594832, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 5628/5680 [13:45:17<06:48,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5629/5680 [13:45:25<06:41,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3989', 'grad_norm': '0.3558', 'learning_rate': '4.136e-08', 'ppl': '1.49', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 46112768, 'tokens/trainable': 45602984, 'epoch': '8.061'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5629/5680 [13:45:25<06:41,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5630/5680 [13:45:33<06:32,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.6383', 'grad_norm': '0.4738', 'learning_rate': '3.978e-08', 'ppl': '1.893', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1041', 'tokens/total': 46120960, 'tokens/trainable': 45611124, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5630/5680 [13:45:33<06:32,  7.85s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5631/5680 [13:45:41<06:24,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.446', 'grad_norm': '0.4043', 'learning_rate': '3.824e-08', 'ppl': '1.562', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 46129152, 'tokens/trainable': 45619232, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5631/5680 [13:45:41<06:24,  7.85s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5632/5680 [13:45:48<06:16,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4168', 'grad_norm': '0.4216', 'learning_rate': '3.672e-08', 'ppl': '1.517', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46137344, 'tokens/trainable': 45627368, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 5632/5680 [13:45:48<06:16,  7.85s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5633/5680 [13:45:56<06:09,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3035', 'grad_norm': '0.3464', 'learning_rate': '3.524e-08', 'ppl': '1.355', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 46145536, 'tokens/trainable': 45635548, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5633/5680 [13:45:56<06:09,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5634/5680 [13:46:04<06:01,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3631', 'grad_norm': '0.4344', 'learning_rate': '3.379e-08', 'ppl': '1.438', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 46153728, 'tokens/trainable': 45643720, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5634/5680 [13:46:04<06:01,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5635/5680 [13:46:12<05:54,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.2035', 'grad_norm': '0.3661', 'learning_rate': '3.236e-08', 'ppl': '1.226', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 46161920, 'tokens/trainable': 45651852, 'epoch': '8.062'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 5635/5680 [13:46:12<05:54,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5636/5680 [13:46:20<05:46,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3403', 'grad_norm': '0.3748', 'learning_rate': '3.097e-08', 'ppl': '1.405', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 46170112, 'tokens/trainable': 45660016, 'epoch': '8.063'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5636/5680 [13:46:20<05:46,  7.88s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5637/5680 [13:46:28<05:38,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5088', 'grad_norm': '0.4091', 'learning_rate': '2.961e-08', 'ppl': '1.663', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46178304, 'tokens/trainable': 45668152, 'epoch': '8.063'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5637/5680 [13:46:28<05:38,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5638/5680 [13:46:36<05:30,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3157', 'grad_norm': '0.4122', 'learning_rate': '2.828e-08', 'ppl': '1.371', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 46186496, 'tokens/trainable': 45676272, 'epoch': '8.063'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5638/5680 [13:46:36<05:30,  7.88s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5639/5680 [13:46:44<05:22,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3884', 'grad_norm': '0.4032', 'learning_rate': '2.698e-08', 'ppl': '1.475', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 46194688, 'tokens/trainable': 45684440, 'epoch': '8.063'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5639/5680 [13:46:44<05:22,  7.88s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5640/5680 [13:46:51<05:14,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4916', 'grad_norm': '0.4173', 'learning_rate': '2.571e-08', 'ppl': '1.635', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46202880, 'tokens/trainable': 45692600, 'epoch': '8.063'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5640/5680 [13:46:51<05:14,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5641/5680 [13:46:59<05:06,  7.85s/it]                                                                                                                                                                                                                                             {'loss': '0.4387', 'grad_norm': '0.3893', 'learning_rate': '2.447e-08', 'ppl': '1.551', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1050', 'tokens/total': 46211072, 'tokens/trainable': 45700788, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5641/5680 [13:46:59<05:06,  7.85s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5642/5680 [13:47:07<04:58,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5591', 'grad_norm': '0.3979', 'learning_rate': '2.326e-08', 'ppl': '1.749', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46219264, 'tokens/trainable': 45708932, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5642/5680 [13:47:07<04:58,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5643/5680 [13:47:15<04:51,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3138', 'grad_norm': '0.3721', 'learning_rate': '2.209e-08', 'ppl': '1.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 46227456, 'tokens/trainable': 45717080, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 5643/5680 [13:47:15<04:51,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5644/5680 [13:47:23<04:43,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6948', 'grad_norm': '0.4073', 'learning_rate': '2.094e-08', 'ppl': '2.003', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46235648, 'tokens/trainable': 45725248, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5644/5680 [13:47:23<04:43,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5645/5680 [13:47:31<04:35,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5075', 'grad_norm': '0.4165', 'learning_rate': '1.982e-08', 'ppl': '1.661', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 46243840, 'tokens/trainable': 45733372, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5645/5680 [13:47:31<04:35,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5646/5680 [13:47:39<04:27,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3908', 'grad_norm': '0.4549', 'learning_rate': '1.874e-08', 'ppl': '1.478', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46252032, 'tokens/trainable': 45741544, 'epoch': '8.064'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 5646/5680 [13:47:39<04:27,  7.86s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5647/5680 [13:47:46<04:19,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3467', 'grad_norm': '0.3956', 'learning_rate': '1.768e-08', 'ppl': '1.414', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46260224, 'tokens/trainable': 45749732, 'epoch': '8.065'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5647/5680 [13:47:46<04:19,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5648/5680 [13:47:54<04:11,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5114', 'grad_norm': '0.4259', 'learning_rate': '1.666e-08', 'ppl': '1.668', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 46268416, 'tokens/trainable': 45757860, 'epoch': '8.065'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5648/5680 [13:47:54<04:11,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5649/5680 [13:48:02<04:03,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3875', 'grad_norm': '0.3785', 'learning_rate': '1.566e-08', 'ppl': '1.473', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46276608, 'tokens/trainable': 45766020, 'epoch': '8.065'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5649/5680 [13:48:02<04:03,  7.87s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5650/5680 [13:48:10<03:56,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5262', 'grad_norm': '0.455', 'learning_rate': '1.47e-08', 'ppl': '1.693', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46284800, 'tokens/trainable': 45774192, 'epoch': '8.065'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 5650/5680 [13:48:10<03:56,  7.87s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5651/5680 [13:48:18<03:48,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.5498', 'grad_norm': '0.4684', 'learning_rate': '1.377e-08', 'ppl': '1.733', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 46292992, 'tokens/trainable': 45782312, 'epoch': '8.065'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5651/5680 [13:48:18<03:48,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5652/5680 [13:48:26<03:40,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.311', 'grad_norm': '0.4387', 'learning_rate': '1.286e-08', 'ppl': '1.365', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1029', 'tokens/total': 46301184, 'tokens/trainable': 45790428, 'epoch': '8.065'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5652/5680 [13:48:26<03:40,  7.88s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5653/5680 [13:48:34<03:32,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4466', 'grad_norm': '0.4326', 'learning_rate': '1.199e-08', 'ppl': '1.563', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1032', 'tokens/total': 46309376, 'tokens/trainable': 45798584, 'epoch': '8.066'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5653/5680 [13:48:34<03:32,  7.89s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5654/5680 [13:48:42<03:24,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3752', 'grad_norm': '0.396', 'learning_rate': '1.115e-08', 'ppl': '1.455', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46317568, 'tokens/trainable': 45806712, 'epoch': '8.066'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 5654/5680 [13:48:42<03:24,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5655/5680 [13:48:50<03:18,  7.94s/it]                                                                                                                                                                                                                                             {'loss': '0.3842', 'grad_norm': '0.3946', 'learning_rate': '1.034e-08', 'ppl': '1.468', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '992.2', 'tokens/total': 46325760, 'tokens/trainable': 45814764, 'epoch': '8.066'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5655/5680 [13:48:50<03:18,  7.94s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5656/5680 [13:48:58<03:09,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.3572', 'grad_norm': '0.4863', 'learning_rate': '9.56e-09', 'ppl': '1.429', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1042', 'tokens/total': 46333952, 'tokens/trainable': 45822944, 'epoch': '8.066'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5656/5680 [13:48:58<03:09,  7.91s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5657/5680 [13:49:05<03:01,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.5055', 'grad_norm': '0.3915', 'learning_rate': '8.81e-09', 'ppl': '1.658', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1031', 'tokens/total': 46342144, 'tokens/trainable': 45831080, 'epoch': '8.066'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 5657/5680 [13:49:05<03:01,  7.91s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5658/5680 [13:49:13<02:54,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.4079', 'grad_norm': '0.4334', 'learning_rate': '8.091e-09', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 46350336, 'tokens/trainable': 45839256, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5658/5680 [13:49:13<02:54,  7.91s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5659/5680 [13:49:21<02:45,  7.90s/it]                                                                                                                                                                                                                                             {'loss': '0.3507', 'grad_norm': '0.4567', 'learning_rate': '7.403e-09', 'ppl': '1.42', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46358528, 'tokens/trainable': 45847432, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5659/5680 [13:49:21<02:45,  7.90s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5660/5680 [13:49:29<02:37,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.3135', 'grad_norm': '0.4124', 'learning_rate': '6.745e-09', 'ppl': '1.368', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1030', 'tokens/total': 46366720, 'tokens/trainable': 45855536, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5660/5680 [13:49:29<02:37,  7.89s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5661/5680 [13:49:37<02:29,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3046', 'grad_norm': '0.3828', 'learning_rate': '6.118e-09', 'ppl': '1.356', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1026', 'tokens/total': 46374912, 'tokens/trainable': 45863600, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 5661/5680 [13:49:37<02:29,  7.88s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5662/5680 [13:49:45<02:21,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.5463', 'grad_norm': '0.4636', 'learning_rate': '5.522e-09', 'ppl': '1.727', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 46383104, 'tokens/trainable': 45871752, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5662/5680 [13:49:45<02:21,  7.86s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5663/5680 [13:49:53<02:13,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.314', 'grad_norm': '0.3298', 'learning_rate': '4.956e-09', 'ppl': '1.369', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1028', 'tokens/total': 46391296, 'tokens/trainable': 45879868, 'epoch': '8.067'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5663/5680 [13:49:53<02:13,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5664/5680 [13:50:01<02:06,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.4083', 'grad_norm': '0.4113', 'learning_rate': '4.42e-09', 'ppl': '1.504', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 46399488, 'tokens/trainable': 45888048, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5664/5680 [13:50:01<02:06,  7.88s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5665/5680 [13:50:08<01:58,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.3944', 'grad_norm': '0.3913', 'learning_rate': '3.916e-09', 'ppl': '1.484', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 46407680, 'tokens/trainable': 45896196, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 5665/5680 [13:50:08<01:58,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5666/5680 [13:50:16<01:50,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.359', 'grad_norm': '0.3614', 'learning_rate': '3.442e-09', 'ppl': '1.432', 'memory/max_active (GiB)': '9.07', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46415872, 'tokens/trainable': 45904352, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5666/5680 [13:50:16<01:50,  7.86s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5667/5680 [13:50:24<01:42,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.4416', 'grad_norm': '0.4494', 'learning_rate': '2.998e-09', 'ppl': '1.555', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 46424064, 'tokens/trainable': 45912488, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5667/5680 [13:50:24<01:42,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5668/5680 [13:50:32<01:34,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.525', 'grad_norm': '0.5121', 'learning_rate': '2.585e-09', 'ppl': '1.69', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1039', 'tokens/total': 46432256, 'tokens/trainable': 45920672, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 5668/5680 [13:50:32<01:34,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5669/5680 [13:50:40<01:26,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.4483', 'grad_norm': '0.4055', 'learning_rate': '2.203e-09', 'ppl': '1.566', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1044', 'tokens/total': 46440448, 'tokens/trainable': 45928860, 'epoch': '8.068'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5669/5680 [13:50:40<01:26,  7.86s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5670/5680 [13:50:48<01:18,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.3493', 'grad_norm': '0.4191', 'learning_rate': '1.851e-09', 'ppl': '1.418', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 46448640, 'tokens/trainable': 45936988, 'epoch': '8.069'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5670/5680 [13:50:48<01:18,  7.86s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5671/5680 [13:50:56<01:10,  7.86s/it]                                                                                                                                                                                                                                             {'loss': '0.6624', 'grad_norm': '0.4452', 'learning_rate': '1.53e-09', 'ppl': '1.939', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1036', 'tokens/total': 46456832, 'tokens/trainable': 45945124, 'epoch': '8.069'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5671/5680 [13:50:56<01:10,  7.86s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5672/5680 [13:51:03<01:02,  7.87s/it]                                                                                                                                                                                                                                             {'loss': '0.409', 'grad_norm': '0.4589', 'learning_rate': '1.239e-09', 'ppl': '1.505', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1027', 'tokens/total': 46465024, 'tokens/trainable': 45953224, 'epoch': '8.069'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 5672/5680 [13:51:03<01:02,  7.87s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5673/5680 [13:51:12<00:55,  7.96s/it]                                                                                                                                                                                                                                             {'loss': '0.5418', 'grad_norm': '0.4087', 'learning_rate': '9.789e-10', 'ppl': '1.719', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '990', 'tokens/total': 46473216, 'tokens/trainable': 45961312, 'epoch': '8.069'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5673/5680 [13:51:12<00:55,  7.96s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5674/5680 [13:51:19<00:47,  7.93s/it]                                                                                                                                                                                                                                             {'loss': '0.3955', 'grad_norm': '0.5441', 'learning_rate': '7.495e-10', 'ppl': '1.485', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 46481408, 'tokens/trainable': 45969440, 'epoch': '8.069'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5674/5680 [13:51:19<00:47,  7.93s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5675/5680 [13:51:27<00:39,  7.91s/it]                                                                                                                                                                                                                                             {'loss': '0.6302', 'grad_norm': '0.5368', 'learning_rate': '5.506e-10', 'ppl': '1.878', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1038', 'tokens/total': 46489600, 'tokens/trainable': 45977616, 'epoch': '8.07'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5675/5680 [13:51:27<00:39,  7.91s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5676/5680 [13:51:35<00:31,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.4923', 'grad_norm': '0.4399', 'learning_rate': '3.824e-10', 'ppl': '1.636', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1034', 'tokens/total': 46497792, 'tokens/trainable': 45985728, 'epoch': '8.07'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 5676/5680 [13:51:35<00:31,  7.89s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5677/5680 [13:51:43<00:23,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.3789', 'grad_norm': '0.4208', 'learning_rate': '2.447e-10', 'ppl': '1.461', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1033', 'tokens/total': 46505984, 'tokens/trainable': 45993856, 'epoch': '8.07'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5677/5680 [13:51:43<00:23,  7.88s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5678/5680 [13:51:51<00:15,  7.88s/it]                                                                                                                                                                                                                                             {'loss': '0.5715', 'grad_norm': '0.4643', 'learning_rate': '1.377e-10', 'ppl': '1.771', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1040', 'tokens/total': 46514176, 'tokens/trainable': 46002036, 'epoch': '8.07'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5678/5680 [13:51:51<00:15,  7.88s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5679/5680 [13:51:59<00:07,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.2234', 'grad_norm': '0.3995', 'learning_rate': '6.118e-11', 'ppl': '1.25', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1035', 'tokens/total': 46522368, 'tokens/trainable': 46010212, 'epoch': '8.07'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5679/5680 [13:51:59<00:07,  7.89s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5680/5680 [13:52:07<00:00,  7.89s/it]                                                                                                                                                                                                                                             {'loss': '0.6636', 'grad_norm': '0.5041', 'learning_rate': '1.53e-11', 'ppl': '1.942', 'memory/max_active (GiB)': '9.05', 'memory/max_allocated (GiB)': '9.05', 'memory/device_reserved (GiB)': '15.55', 'tokens/train_per_sec_per_gpu': '1037', 'tokens/total': 46530560, 'tokens/trainable': 46018384, 'epoch': '8.07'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5680/5680 [13:52:07<00:00,  7.89s/it][2026-01-27 11:41:20,782] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 11:42:06,722] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out/checkpoint-5680
[2026-01-27 11:43:00,647] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:860: UserWarning: `_get_pg_default_device` will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use `_get_object_coll_device`. If you need to query the device types supported by group, please use `_device_capability(group)`. 
  warnings.warn(

[2026-01-27 11:43:00,647] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:904: UserWarning: Multiple backends are registered with this ProcessGroup. We cannot determine which one is the default. Returning cpu. Please consider using other APIs.
  warnings.warn(

                                                                                                                                                                                                                                             {'train_runtime': '5.003e+04', 'train_samples_per_second': '0.227', 'train_steps_per_second': '0.114', 'train_loss': '0.5811', 'memory/max_active (GiB)': '1.79', 'memory/max_allocated (GiB)': '1.79', 'memory/device_reserved (GiB)': '15.55', 'epoch': '8.07', 'tokens/train_per_sec_per_gpu': '0'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5680/5680 [13:53:47<00:00,  7.89s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5680/5680 [13:53:47<00:00,  8.81s/it]
[2026-01-27 11:43:01,319] [INFO] [axolotl.train.save_trained_model:233] [PID:58141] Training completed! Saving trained model to ./outputs/qlora-out.
[2026-01-27 11:43:01,327] [WARNING] [py.warnings._showwarnmsg:109] [PID:58141] /apool/venvi/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:675: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
  warnings.warn(

[2026-01-27 11:43:48,504] [INFO] [axolotl.core.trainers.base._save:721] [PID:58141] Saving model checkpoint to ./outputs/qlora-out
[2026-01-27 11:43:51,617] [INFO] [axolotl.train.save_trained_model:351] [PID:58141] Model successfully saved to ./outputs/qlora-out